/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010044#include "pycore_object.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010045#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050047#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070048#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080072
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000080
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000081
#if defined(__cplusplus)
extern "C" {
#endif

/* Largest valid code point (Unicode 6.0): U+10FFFF, i.e. 1,114,111. */
#define MAX_UNICODE 0x10ffff

/* _PyUnicode_CHECK(op): type check that the macros in this file wrap in
   assert().  Without Py_DEBUG it is a plain PyUnicode_Check(); with
   Py_DEBUG it runs _PyUnicode_CheckConsistency() with check_content == 0,
   i.e. it validates the object layout but does not scan the characters. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020094
/* --- Raw field accessors ------------------------------------------------
   Plain member accesses through a cast: they perform no type or readiness
   check. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject *)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject *)(op))->utf8_length)
#define _PyUnicode_WSTR(op) (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject *)(op))->data.any)

/* --- Checked accessors --------------------------------------------------
   Same fields, but each expansion first asserts _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string.  For a compact ASCII string this is the
   address immediately after the PyASCIIObject header; for anything else it
   is the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((char *)((PyASCIIObject *)(op) + 1))        \
         : _PyUnicode_UTF8(op))

/* UTF-8 length of a ready string: the character count for a compact ASCII
   string, the utf8_length member otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((PyASCIIObject *)(op))->length              \
         : _PyUnicode_UTF8_LENGTH(op))

/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready().
   The type assertion uses this file's _PyUnicode_CHECK. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200136
/* True if the UTF-8 pointer is the string's own character buffer rather
   than a separate block.  Must not be used on compact ASCII strings (the
   second assertion rejects them). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the string's own character buffer rather
   than a separate block.  Every access goes through the macro argument, so
   the expansion does not depend on what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
144
/* Non-zero when the object owns a separately allocated UTF-8 block: the
   string is not compact ASCII, its utf8 pointer is set, and that pointer
   is not simply the character buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                 \
     _PyUnicode_UTF8(op) != NULL &&                     \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object owns a separately allocated wstr block: the
   wstr pointer is set and either the string is not ready yet or the
   pointer is not simply the character buffer.  PyUnicode_DATA() is only
   evaluated for ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL &&                     \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
158
/* Generic helper macro to convert characters of different types.

   from_type, to_type: valid type names of the source and destination
                       character units.
   begin, end:         pointers delimiting the source characters
                       [begin, end); they should be of type "from_type *".
   to:                 pointer of type "to_type *" to the buffer receiving
                       the converted characters.

   Each source unit is converted with a plain (to_type) cast.  The main
   loop handles four units per iteration and a tail loop copies the
   remaining 0-3 units.  The source is only read, so it is accessed through
   pointers to const; all locals carry a leading underscore so they do not
   shadow caller variables. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200182
/* Platform-tuned over-allocation factor.  Going by the per-platform notes
   below, 2 stands for 50% and 4 for 25%, so the extra space is presumably
   size / OVERALLOCATE_FACTOR -- the code using it is not in this part of
   the file. */
#ifndef MS_WINDOWS
   /* Non-Windows (tuned on Linux): over-allocating by 25% works best. */
#  define OVERALLOCATE_FACTOR 4
#else
   /* Windows: over-allocating by 50% works best. */
#  define OVERALLOCATE_FACTOR 2
#endif
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200205 do { \
206 if (unicode_empty != NULL) \
207 Py_INCREF(unicode_empty); \
208 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209 unicode_empty = PyUnicode_New(0, 0); \
210 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
213 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200214 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200215 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
Serhiy Storchaka678db842013-01-26 12:16:36 +0200217#define _Py_RETURN_UNICODE_EMPTY() \
218 do { \
219 _Py_INCREF_UNICODE_EMPTY(); \
220 return unicode_empty; \
221 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinner59423e32018-11-26 13:40:01 +0100223static inline void
224unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
225 Py_ssize_t start, Py_ssize_t length)
226{
227 assert(0 <= start);
228 assert(kind != PyUnicode_WCHAR_KIND);
229 switch (kind) {
230 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100231 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100232 Py_UCS1 ch = (unsigned char)value;
233 Py_UCS1 *to = (Py_UCS1 *)data + start;
234 memset(to, ch, length);
235 break;
236 }
237 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS2 ch = (Py_UCS2)value;
240 Py_UCS2 *to = (Py_UCS2 *)data + start;
241 const Py_UCS2 *end = to + length;
242 for (; to < end; ++to) *to = ch;
243 break;
244 }
245 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS4 ch = value;
248 Py_UCS4 * to = (Py_UCS4 *)data + start;
249 const Py_UCS4 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 default: Py_UNREACHABLE();
254 }
255}
256
257
/* Forward declaration: the function is used before its definition. */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings (_Py_Identifier entries); NULL while the list is
   empty. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by code point (0..255); a NULL
   slot means that character has not been cached yet. */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, holding 1 for whitespace and 0 for
   everything else (all entries not listed below are zero-initialized). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
299
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200300/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200301static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200302static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100303static int unicode_modifiable(PyObject *unicode);
304
Victor Stinnerfe226c02011-10-03 03:52:20 +0200305
Alexander Belopolsky40018472011-02-26 01:02:56 +0000306static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100307_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200308static PyObject *
309_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
310static PyObject *
311_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
312
313static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000315 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100316 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000317 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
318
Alexander Belopolsky40018472011-02-26 01:02:56 +0000319static void
320raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300321 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100322 PyObject *unicode,
323 Py_ssize_t startpos, Py_ssize_t endpos,
324 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000325
/* Fast detection of ASCII line breaks: a 128-entry table indexed by ASCII
   code, holding 1 for line-break characters and 0 for everything else (all
   entries not listed below are zero-initialized). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
353
/* Converter named by the Py_UCS4_converter Argument Clinic class
   (converter = 'convert_uc').  Declared ahead of the generated clinic
   header below, which presumably refers to it -- keep this order. */
static int convert_uc(PyObject *obj, void *addr);

#include "clinic/unicodeobject.c.h"
357
Victor Stinner3d4226a2018-08-29 22:21:32 +0200358_Py_error_handler
359_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200360{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200361 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200362 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200363 }
364 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200365 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200366 }
367 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200368 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200369 }
370 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200371 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200372 }
373 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200374 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 }
376 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200377 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 }
379 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200380 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_OTHER;
383}
384
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300385/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
386 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000387Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000388PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000389{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000390#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000391 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000392#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000393 /* This is actually an illegal character, so it should
394 not be passed to unichr. */
395 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000396#endif
397}
398
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a unicode object.

   op:            object to inspect; it must pass PyUnicode_Check().
   check_content: when non-zero (and the string is not in the wchar_t-only
                  state), also scan the characters to verify that the kind
                  and the ascii flag match the largest character, and that
                  the buffer ends with a null character.

   Every invariant goes through _PyObject_ASSERT(op, ...), so the function
   only returns -- always 1 -- when all of them hold.  NOTE(review):
   presumably a failed check aborts after reporting op; confirm against
   _PyObject_ASSERT.  The constant result allows
   assert(_PyUnicode_CheckConsistency(op, 1)). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define ASSERT(expr) _PyObject_ASSERT(op, (expr))

    PyASCIIObject *ascii;
    unsigned int kind;

    ASSERT(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject fields are inspected */
        ASSERT(kind == PyUnicode_1BYTE_KIND);
        ASSERT(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters start right after the
               PyCompactUnicodeObject header */
            data = compact + 1;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(ascii->state.ascii == 0);
            ASSERT(ascii->state.ready == 1);
            ASSERT (compact->utf8 != data);
        }
        else {
            /* not compact: the characters live behind data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may be set */
                ASSERT(ascii->length == 0);
                ASSERT(ascii->hash == -1);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ascii == 0);
                ASSERT(ascii->state.ready == 0);
                ASSERT(ascii->state.interned == SSTATE_NOT_INTERNED);
                ASSERT(ascii->wstr != NULL);
                ASSERT(data == NULL);
                ASSERT(compact->utf8 == NULL);
            }
            else {
                ASSERT(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ready == 1);
                ASSERT(data != NULL);
                /* the UTF-8 pointer aliases the character buffer exactly
                   when the string is pure ASCII */
                if (ascii->state.ascii) {
                    ASSERT (compact->utf8 == data);
                    ASSERT (compact->utf8_length == ascii->length);
                }
                else
                    ASSERT (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the character buffer exactly when the kind has
               the same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                ASSERT(ascii->wstr == data);
                ASSERT(compact->wstr_length == ascii->length);
            } else
                ASSERT(ascii->wstr != data);
        }

        /* an absent buffer must come with a zero cached length */
        if (compact->utf8 == NULL)
            ASSERT(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            ASSERT(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest character stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                ASSERT(maxchar >= 128);
                ASSERT(maxchar <= 255);
            }
            else
                ASSERT(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            ASSERT(maxchar >= 0x100);
            ASSERT(maxchar <= 0xFFFF);
        }
        else {
            ASSERT(maxchar >= 0x10000);
            ASSERT(maxchar <= MAX_UNICODE);
        }
        /* the slot just past the last character must hold a null */
        ASSERT(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef ASSERT
}
#endif
518
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519static PyObject*
520unicode_result_wchar(PyObject *unicode)
521{
522#ifndef Py_DEBUG
523 Py_ssize_t len;
524
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100525 len = _PyUnicode_WSTR_LENGTH(unicode);
526 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100527 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200528 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 }
530
531 if (len == 1) {
532 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100533 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
535 Py_DECREF(unicode);
536 return latin1_char;
537 }
538 }
539
540 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200541 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 return NULL;
543 }
544#else
Victor Stinneraa771272012-10-04 02:32:58 +0200545 assert(Py_REFCNT(unicode) == 1);
546
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100547 /* don't make the result ready in debug mode to ensure that the caller
548 makes the string ready before using it */
549 assert(_PyUnicode_CheckConsistency(unicode, 1));
550#endif
551 return unicode;
552}
553
554static PyObject*
555unicode_result_ready(PyObject *unicode)
556{
557 Py_ssize_t length;
558
559 length = PyUnicode_GET_LENGTH(unicode);
560 if (length == 0) {
561 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100562 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200563 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100564 }
565 return unicode_empty;
566 }
567
568 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200569 void *data = PyUnicode_DATA(unicode);
570 int kind = PyUnicode_KIND(unicode);
571 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 if (ch < 256) {
573 PyObject *latin1_char = unicode_latin1[ch];
574 if (latin1_char != NULL) {
575 if (unicode != latin1_char) {
576 Py_INCREF(latin1_char);
577 Py_DECREF(unicode);
578 }
579 return latin1_char;
580 }
581 else {
582 assert(_PyUnicode_CheckConsistency(unicode, 1));
583 Py_INCREF(unicode);
584 unicode_latin1[ch] = unicode;
585 return unicode;
586 }
587 }
588 }
589
590 assert(_PyUnicode_CheckConsistency(unicode, 1));
591 return unicode;
592}
593
594static PyObject*
595unicode_result(PyObject *unicode)
596{
597 assert(_PyUnicode_CHECK(unicode));
598 if (PyUnicode_IS_READY(unicode))
599 return unicode_result_ready(unicode);
600 else
601 return unicode_result_wchar(unicode);
602}
603
Victor Stinnerc4b49542011-12-11 22:44:26 +0100604static PyObject*
605unicode_result_unchanged(PyObject *unicode)
606{
607 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500608 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100609 return NULL;
610 Py_INCREF(unicode);
611 return unicode;
612 }
613 else
614 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100615 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100616}
617
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
619 ASCII, Latin1, UTF-8, etc. */
620static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200621backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
623{
Victor Stinnerad771582015-10-09 12:38:53 +0200624 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 Py_UCS4 ch;
626 enum PyUnicode_Kind kind;
627 void *data;
628
629 assert(PyUnicode_IS_READY(unicode));
630 kind = PyUnicode_KIND(unicode);
631 data = PyUnicode_DATA(unicode);
632
633 size = 0;
634 /* determine replacement size */
635 for (i = collstart; i < collend; ++i) {
636 Py_ssize_t incr;
637
638 ch = PyUnicode_READ(kind, data, i);
639 if (ch < 0x100)
640 incr = 2+2;
641 else if (ch < 0x10000)
642 incr = 2+4;
643 else {
644 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200645 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 }
647 if (size > PY_SSIZE_T_MAX - incr) {
648 PyErr_SetString(PyExc_OverflowError,
649 "encoded result is too long for a Python string");
650 return NULL;
651 }
652 size += incr;
653 }
654
Victor Stinnerad771582015-10-09 12:38:53 +0200655 str = _PyBytesWriter_Prepare(writer, str, size);
656 if (str == NULL)
657 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200658
659 /* generate replacement */
660 for (i = collstart; i < collend; ++i) {
661 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200662 *str++ = '\\';
663 if (ch >= 0x00010000) {
664 *str++ = 'U';
665 *str++ = Py_hexdigits[(ch>>28)&0xf];
666 *str++ = Py_hexdigits[(ch>>24)&0xf];
667 *str++ = Py_hexdigits[(ch>>20)&0xf];
668 *str++ = Py_hexdigits[(ch>>16)&0xf];
669 *str++ = Py_hexdigits[(ch>>12)&0xf];
670 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200671 }
Victor Stinner797485e2015-10-09 03:17:30 +0200672 else if (ch >= 0x100) {
673 *str++ = 'u';
674 *str++ = Py_hexdigits[(ch>>12)&0xf];
675 *str++ = Py_hexdigits[(ch>>8)&0xf];
676 }
677 else
678 *str++ = 'x';
679 *str++ = Py_hexdigits[(ch>>4)&0xf];
680 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200681 }
682 return str;
683}
684
685/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
686 ASCII, Latin1, UTF-8, etc. */
687static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200688xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200689 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
690{
Victor Stinnerad771582015-10-09 12:38:53 +0200691 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200692 Py_UCS4 ch;
693 enum PyUnicode_Kind kind;
694 void *data;
695
696 assert(PyUnicode_IS_READY(unicode));
697 kind = PyUnicode_KIND(unicode);
698 data = PyUnicode_DATA(unicode);
699
700 size = 0;
701 /* determine replacement size */
702 for (i = collstart; i < collend; ++i) {
703 Py_ssize_t incr;
704
705 ch = PyUnicode_READ(kind, data, i);
706 if (ch < 10)
707 incr = 2+1+1;
708 else if (ch < 100)
709 incr = 2+2+1;
710 else if (ch < 1000)
711 incr = 2+3+1;
712 else if (ch < 10000)
713 incr = 2+4+1;
714 else if (ch < 100000)
715 incr = 2+5+1;
716 else if (ch < 1000000)
717 incr = 2+6+1;
718 else {
719 assert(ch <= MAX_UNICODE);
720 incr = 2+7+1;
721 }
722 if (size > PY_SSIZE_T_MAX - incr) {
723 PyErr_SetString(PyExc_OverflowError,
724 "encoded result is too long for a Python string");
725 return NULL;
726 }
727 size += incr;
728 }
729
Victor Stinnerad771582015-10-09 12:38:53 +0200730 str = _PyBytesWriter_Prepare(writer, str, size);
731 if (str == NULL)
732 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200733
734 /* generate replacement */
735 for (i = collstart; i < collend; ++i) {
736 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
737 }
738 return str;
739}
740
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741/* --- Bloom Filters ----------------------------------------------------- */
742
743/* stuff to implement simple "bloom filters" for Unicode characters.
744 to keep things simple, we use a single bitmask, using the least 5
745 bits from each unicode characters as the bit index. */
746
747/* the linebreak mask is set up by Unicode_Init below */
748
Antoine Pitrouf068f942010-01-13 14:19:12 +0000749#if LONG_BIT >= 128
750#define BLOOM_WIDTH 128
751#elif LONG_BIT >= 64
752#define BLOOM_WIDTH 64
753#elif LONG_BIT >= 32
754#define BLOOM_WIDTH 32
755#else
756#error "LONG_BIT is smaller than 32"
757#endif
758
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759#define BLOOM_MASK unsigned long
760
Serhiy Storchaka05997252013-01-26 12:14:02 +0200761static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000762
Antoine Pitrouf068f942010-01-13 14:19:12 +0000763#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000764
Benjamin Peterson29060642009-01-31 22:14:21 +0000765#define BLOOM_LINEBREAK(ch) \
766 ((ch) < 128U ? ascii_linebreak[(ch)] : \
767 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700769static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200770make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000771{
Victor Stinnera85af502013-04-09 21:53:54 +0200772#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
773 do { \
774 TYPE *data = (TYPE *)PTR; \
775 TYPE *end = data + LEN; \
776 Py_UCS4 ch; \
777 for (; data != end; data++) { \
778 ch = *data; \
779 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
780 } \
781 break; \
782 } while (0)
783
Thomas Wouters477c8d52006-05-27 19:21:47 +0000784 /* calculate simple bloom-style bitmask for a given unicode string */
785
Antoine Pitrouf068f942010-01-13 14:19:12 +0000786 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000787
788 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200789 switch (kind) {
790 case PyUnicode_1BYTE_KIND:
791 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
792 break;
793 case PyUnicode_2BYTE_KIND:
794 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
795 break;
796 case PyUnicode_4BYTE_KIND:
797 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
798 break;
799 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700800 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200801 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200803
804#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000805}
806
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300807static int
808ensure_unicode(PyObject *obj)
809{
810 if (!PyUnicode_Check(obj)) {
811 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200812 "must be str, not %.100s",
813 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300814 return -1;
815 }
816 return PyUnicode_READY(obj);
817}
818
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819/* Compilation of templated routines */
820
821#include "stringlib/asciilib.h"
822#include "stringlib/fastsearch.h"
823#include "stringlib/partition.h"
824#include "stringlib/split.h"
825#include "stringlib/count.h"
826#include "stringlib/find.h"
827#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200828#include "stringlib/undef.h"
829
830#include "stringlib/ucs1lib.h"
831#include "stringlib/fastsearch.h"
832#include "stringlib/partition.h"
833#include "stringlib/split.h"
834#include "stringlib/count.h"
835#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300836#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200837#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200838#include "stringlib/undef.h"
839
840#include "stringlib/ucs2lib.h"
841#include "stringlib/fastsearch.h"
842#include "stringlib/partition.h"
843#include "stringlib/split.h"
844#include "stringlib/count.h"
845#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300846#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200847#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200848#include "stringlib/undef.h"
849
850#include "stringlib/ucs4lib.h"
851#include "stringlib/fastsearch.h"
852#include "stringlib/partition.h"
853#include "stringlib/split.h"
854#include "stringlib/count.h"
855#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300856#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200858#include "stringlib/undef.h"
859
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200860#include "stringlib/unicodedefs.h"
861#include "stringlib/fastsearch.h"
862#include "stringlib/count.h"
863#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100864#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865
Guido van Rossumd57fd912000-03-10 22:53:23 +0000866/* --- Unicode Object ----------------------------------------------------- */
867
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700868static inline Py_ssize_t
869findchar(const void *s, int kind,
870 Py_ssize_t size, Py_UCS4 ch,
871 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200872{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200873 switch (kind) {
874 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200875 if ((Py_UCS1) ch != ch)
876 return -1;
877 if (direction > 0)
878 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
879 else
880 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200881 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200882 if ((Py_UCS2) ch != ch)
883 return -1;
884 if (direction > 0)
885 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
886 else
887 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200888 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200889 if (direction > 0)
890 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
891 else
892 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200893 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700894 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896}
897
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in the range [old_length, current length) are
   overwritten, every byte of them with 0xff; nothing is done when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size in bytes of one character */
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
916
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917static PyObject*
918resize_compact(PyObject *unicode, Py_ssize_t length)
919{
920 Py_ssize_t char_size;
921 Py_ssize_t struct_size;
922 Py_ssize_t new_size;
923 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100924 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200925#ifdef Py_DEBUG
926 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
927#endif
928
Victor Stinner79891572012-05-03 13:43:07 +0200929 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200930 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100931 assert(PyUnicode_IS_COMPACT(unicode));
932
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200933 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100934 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 struct_size = sizeof(PyASCIIObject);
936 else
937 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200938 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
941 PyErr_NoMemory();
942 return NULL;
943 }
944 new_size = (struct_size + (length + 1) * char_size);
945
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200946 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
947 PyObject_DEL(_PyUnicode_UTF8(unicode));
948 _PyUnicode_UTF8(unicode) = NULL;
949 _PyUnicode_UTF8_LENGTH(unicode) = 0;
950 }
Victor Stinner84def372011-12-11 20:04:56 +0100951 _Py_DEC_REFTOTAL;
952 _Py_ForgetReference(unicode);
953
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300954 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100955 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100956 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 PyErr_NoMemory();
958 return NULL;
959 }
Victor Stinner84def372011-12-11 20:04:56 +0100960 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100962
Victor Stinnerfe226c02011-10-03 03:52:20 +0200963 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100966 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200967 _PyUnicode_WSTR_LENGTH(unicode) = length;
968 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100969 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
970 PyObject_DEL(_PyUnicode_WSTR(unicode));
971 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100972 if (!PyUnicode_IS_ASCII(unicode))
973 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100974 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200975#ifdef Py_DEBUG
976 unicode_fill_invalid(unicode, old_length);
977#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
979 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200980 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200981 return unicode;
982}
983
Alexander Belopolsky40018472011-02-26 01:02:56 +0000984static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200985resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986{
Victor Stinner95663112011-10-04 01:03:50 +0200987 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100988 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200990 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000991
Victor Stinnerfe226c02011-10-03 03:52:20 +0200992 if (PyUnicode_IS_READY(unicode)) {
993 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200994 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
998#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999
1000 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001001 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1003 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004
1005 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1006 PyErr_NoMemory();
1007 return -1;
1008 }
1009 new_size = (length + 1) * char_size;
1010
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1012 {
1013 PyObject_DEL(_PyUnicode_UTF8(unicode));
1014 _PyUnicode_UTF8(unicode) = NULL;
1015 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1016 }
1017
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 data = (PyObject *)PyObject_REALLOC(data, new_size);
1019 if (data == NULL) {
1020 PyErr_NoMemory();
1021 return -1;
1022 }
1023 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001024 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001026 _PyUnicode_WSTR_LENGTH(unicode) = length;
1027 }
1028 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001029 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001030 _PyUnicode_UTF8_LENGTH(unicode) = length;
1031 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032 _PyUnicode_LENGTH(unicode) = length;
1033 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001034#ifdef Py_DEBUG
1035 unicode_fill_invalid(unicode, old_length);
1036#endif
Victor Stinner95663112011-10-04 01:03:50 +02001037 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001038 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 }
Victor Stinner95663112011-10-04 01:03:50 +02001042 assert(_PyUnicode_WSTR(unicode) != NULL);
1043
1044 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001045 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001046 PyErr_NoMemory();
1047 return -1;
1048 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001050 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001051 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001052 if (!wstr) {
1053 PyErr_NoMemory();
1054 return -1;
1055 }
1056 _PyUnicode_WSTR(unicode) = wstr;
1057 _PyUnicode_WSTR(unicode)[length] = 0;
1058 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001059 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return 0;
1061}
1062
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063static PyObject*
1064resize_copy(PyObject *unicode, Py_ssize_t length)
1065{
1066 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001067 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001068 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001069
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001070 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071
1072 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1073 if (copy == NULL)
1074 return NULL;
1075
1076 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001077 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001079 }
1080 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001081 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001082
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001083 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 if (w == NULL)
1085 return NULL;
1086 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1087 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001088 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001089 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001090 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001091 }
1092}
1093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001095 Ux0000 terminated; some code (e.g. new_identifier)
1096 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097
1098 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001099 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100
1101*/
1102
Alexander Belopolsky40018472011-02-26 01:02:56 +00001103static PyUnicodeObject *
1104_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108
Thomas Wouters477c8d52006-05-27 19:21:47 +00001109 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (length == 0 && unicode_empty != NULL) {
1111 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001112 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 }
1114
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001115 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001116 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001117 return (PyUnicodeObject *)PyErr_NoMemory();
1118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 if (length < 0) {
1120 PyErr_SetString(PyExc_SystemError,
1121 "Negative size passed to _PyUnicode_New");
1122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
1124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1126 if (unicode == NULL)
1127 return NULL;
1128 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001129
1130 _PyUnicode_WSTR_LENGTH(unicode) = length;
1131 _PyUnicode_HASH(unicode) = -1;
1132 _PyUnicode_STATE(unicode).interned = 0;
1133 _PyUnicode_STATE(unicode).kind = 0;
1134 _PyUnicode_STATE(unicode).compact = 0;
1135 _PyUnicode_STATE(unicode).ready = 0;
1136 _PyUnicode_STATE(unicode).ascii = 0;
1137 _PyUnicode_DATA_ANY(unicode) = NULL;
1138 _PyUnicode_LENGTH(unicode) = 0;
1139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1143 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001144 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001145 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001146 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
Jeremy Hyltond8082792003-09-16 19:41:39 +00001149 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001150 * the caller fails before initializing str -- unicode_resize()
1151 * reads str[0], and the Keep-Alive optimization can keep memory
1152 * allocated for str alive across a call to unicode_dealloc(unicode).
1153 * We don't want unicode_resize to read uninitialized memory in
1154 * that case.
1155 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 _PyUnicode_WSTR(unicode)[0] = 0;
1157 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001158
Victor Stinner7931d9a2011-11-04 00:22:48 +01001159 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 return unicode;
1161}
1162
Victor Stinnerf42dc442011-10-02 23:33:16 +02001163static const char*
1164unicode_kind_name(PyObject *unicode)
1165{
Victor Stinner42dfd712011-10-03 14:41:45 +02001166 /* don't check consistency: unicode_kind_name() is called from
1167 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001168 if (!PyUnicode_IS_COMPACT(unicode))
1169 {
1170 if (!PyUnicode_IS_READY(unicode))
1171 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001172 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001173 {
1174 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 return "legacy ascii";
1177 else
1178 return "legacy latin1";
1179 case PyUnicode_2BYTE_KIND:
1180 return "legacy UCS2";
1181 case PyUnicode_4BYTE_KIND:
1182 return "legacy UCS4";
1183 default:
1184 return "<legacy invalid kind>";
1185 }
1186 }
1187 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001188 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001189 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001190 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001191 return "ascii";
1192 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001193 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001194 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001195 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001196 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001197 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 default:
1199 return "<invalid compact kind>";
1200 }
1201}
1202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001205char *_PyUnicode_utf8(void *unicode_raw){
1206 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001207 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208}
1209
Victor Stinnera42de742018-11-22 10:25:22 +01001210void *_PyUnicode_compact_data(void *unicode_raw) {
1211 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 return _PyUnicode_COMPACT_DATA(unicode);
1213}
Victor Stinnera42de742018-11-22 10:25:22 +01001214void *_PyUnicode_data(void *unicode_raw) {
1215 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 printf("obj %p\n", unicode);
1217 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1218 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1219 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1220 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1221 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1222 return PyUnicode_DATA(unicode);
1223}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001224
1225void
1226_PyUnicode_Dump(PyObject *op)
1227{
1228 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001229 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1230 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1231 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001232
Victor Stinnera849a4b2011-10-03 12:12:11 +02001233 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001234 {
1235 if (ascii->state.ascii)
1236 data = (ascii + 1);
1237 else
1238 data = (compact + 1);
1239 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001240 else
1241 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001242 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1243 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001244
Victor Stinnera849a4b2011-10-03 12:12:11 +02001245 if (ascii->wstr == data)
1246 printf("shared ");
1247 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001248
Victor Stinnera3b334d2011-10-03 13:53:37 +02001249 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001250 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001251 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1252 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001253 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1254 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001255 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001256 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001257}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258#endif
1259
1260PyObject *
1261PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1262{
1263 PyObject *obj;
1264 PyCompactUnicodeObject *unicode;
1265 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001266 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001267 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 Py_ssize_t char_size;
1269 Py_ssize_t struct_size;
1270
1271 /* Optimization for empty strings */
1272 if (size == 0 && unicode_empty != NULL) {
1273 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001274 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 }
1276
Victor Stinner9e9d6892011-10-04 01:02:02 +02001277 is_ascii = 0;
1278 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 struct_size = sizeof(PyCompactUnicodeObject);
1280 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001281 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 char_size = 1;
1283 is_ascii = 1;
1284 struct_size = sizeof(PyASCIIObject);
1285 }
1286 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001287 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001288 char_size = 1;
1289 }
1290 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001291 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292 char_size = 2;
1293 if (sizeof(wchar_t) == 2)
1294 is_sharing = 1;
1295 }
1296 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001297 if (maxchar > MAX_UNICODE) {
1298 PyErr_SetString(PyExc_SystemError,
1299 "invalid maximum character passed to PyUnicode_New");
1300 return NULL;
1301 }
Victor Stinner8f825062012-04-27 13:55:39 +02001302 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 char_size = 4;
1304 if (sizeof(wchar_t) == 4)
1305 is_sharing = 1;
1306 }
1307
1308 /* Ensure we won't overflow the size. */
1309 if (size < 0) {
1310 PyErr_SetString(PyExc_SystemError,
1311 "Negative size passed to PyUnicode_New");
1312 return NULL;
1313 }
1314 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1315 return PyErr_NoMemory();
1316
1317 /* Duplicated allocation code from _PyObject_New() instead of a call to
1318 * PyObject_New() so we are able to allocate space for the object and
1319 * it's data buffer.
1320 */
1321 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1322 if (obj == NULL)
1323 return PyErr_NoMemory();
1324 obj = PyObject_INIT(obj, &PyUnicode_Type);
1325 if (obj == NULL)
1326 return NULL;
1327
1328 unicode = (PyCompactUnicodeObject *)obj;
1329 if (is_ascii)
1330 data = ((PyASCIIObject*)obj) + 1;
1331 else
1332 data = unicode + 1;
1333 _PyUnicode_LENGTH(unicode) = size;
1334 _PyUnicode_HASH(unicode) = -1;
1335 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001336 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 _PyUnicode_STATE(unicode).compact = 1;
1338 _PyUnicode_STATE(unicode).ready = 1;
1339 _PyUnicode_STATE(unicode).ascii = is_ascii;
1340 if (is_ascii) {
1341 ((char*)data)[size] = 0;
1342 _PyUnicode_WSTR(unicode) = NULL;
1343 }
Victor Stinner8f825062012-04-27 13:55:39 +02001344 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 ((char*)data)[size] = 0;
1346 _PyUnicode_WSTR(unicode) = NULL;
1347 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001349 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 else {
1352 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001353 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001354 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001356 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 ((Py_UCS4*)data)[size] = 0;
1358 if (is_sharing) {
1359 _PyUnicode_WSTR_LENGTH(unicode) = size;
1360 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1361 }
1362 else {
1363 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1364 _PyUnicode_WSTR(unicode) = NULL;
1365 }
1366 }
Victor Stinner8f825062012-04-27 13:55:39 +02001367#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001368 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001369#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001370 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return obj;
1372}
1373
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   merged into a single code point; any other unit (including a lone
   surrogate) is copied as is.  The remaining width conversions are done with
   macros for efficiency, only this one needs a real function.

   `unicode` must be a 4-byte kind string whose length matches the number of
   code points produced.  The terminating null character is not written
   here: the caller is expected to have reserved room for it. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* There must still be room for the code point about to be stored. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Valid pair: two input units become one output code point. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output buffer must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1413
Victor Stinnercd9950f2011-10-02 00:34:53 +02001414static int
Victor Stinner488fa492011-12-12 00:01:39 +01001415unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001416{
Victor Stinner488fa492011-12-12 00:01:39 +01001417 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001418 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001419 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001420 return -1;
1421 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001422 return 0;
1423}
1424
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001425static int
1426_copy_characters(PyObject *to, Py_ssize_t to_start,
1427 PyObject *from, Py_ssize_t from_start,
1428 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001430 unsigned int from_kind, to_kind;
1431 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432
Victor Stinneree4544c2012-05-09 22:24:08 +02001433 assert(0 <= how_many);
1434 assert(0 <= from_start);
1435 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001436 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001438 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439
Victor Stinnerd3f08822012-05-29 12:57:52 +02001440 assert(PyUnicode_Check(to));
1441 assert(PyUnicode_IS_READY(to));
1442 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1443
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001444 if (how_many == 0)
1445 return 0;
1446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001448 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001450 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451
Victor Stinnerf1852262012-06-16 16:38:26 +02001452#ifdef Py_DEBUG
1453 if (!check_maxchar
1454 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1455 {
1456 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1457 Py_UCS4 ch;
1458 Py_ssize_t i;
1459 for (i=0; i < how_many; i++) {
1460 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1461 assert(ch <= to_maxchar);
1462 }
1463 }
1464#endif
1465
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001466 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001467 if (check_maxchar
1468 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1469 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001470 /* Writing Latin-1 characters into an ASCII string requires to
1471 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001472 Py_UCS4 max_char;
1473 max_char = ucs1lib_find_max_char(from_data,
1474 (Py_UCS1*)from_data + how_many);
1475 if (max_char >= 128)
1476 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001477 }
Christian Heimesf051e432016-09-13 20:22:02 +02001478 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001479 (char*)from_data + from_kind * from_start,
1480 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001482 else if (from_kind == PyUnicode_1BYTE_KIND
1483 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001484 {
1485 _PyUnicode_CONVERT_BYTES(
1486 Py_UCS1, Py_UCS2,
1487 PyUnicode_1BYTE_DATA(from) + from_start,
1488 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1489 PyUnicode_2BYTE_DATA(to) + to_start
1490 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001491 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001492 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001493 && to_kind == PyUnicode_4BYTE_KIND)
1494 {
1495 _PyUnicode_CONVERT_BYTES(
1496 Py_UCS1, Py_UCS4,
1497 PyUnicode_1BYTE_DATA(from) + from_start,
1498 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1499 PyUnicode_4BYTE_DATA(to) + to_start
1500 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001501 }
1502 else if (from_kind == PyUnicode_2BYTE_KIND
1503 && to_kind == PyUnicode_4BYTE_KIND)
1504 {
1505 _PyUnicode_CONVERT_BYTES(
1506 Py_UCS2, Py_UCS4,
1507 PyUnicode_2BYTE_DATA(from) + from_start,
1508 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1509 PyUnicode_4BYTE_DATA(to) + to_start
1510 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001511 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001513 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1514
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 if (!check_maxchar) {
1516 if (from_kind == PyUnicode_2BYTE_KIND
1517 && to_kind == PyUnicode_1BYTE_KIND)
1518 {
1519 _PyUnicode_CONVERT_BYTES(
1520 Py_UCS2, Py_UCS1,
1521 PyUnicode_2BYTE_DATA(from) + from_start,
1522 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1523 PyUnicode_1BYTE_DATA(to) + to_start
1524 );
1525 }
1526 else if (from_kind == PyUnicode_4BYTE_KIND
1527 && to_kind == PyUnicode_1BYTE_KIND)
1528 {
1529 _PyUnicode_CONVERT_BYTES(
1530 Py_UCS4, Py_UCS1,
1531 PyUnicode_4BYTE_DATA(from) + from_start,
1532 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1533 PyUnicode_1BYTE_DATA(to) + to_start
1534 );
1535 }
1536 else if (from_kind == PyUnicode_4BYTE_KIND
1537 && to_kind == PyUnicode_2BYTE_KIND)
1538 {
1539 _PyUnicode_CONVERT_BYTES(
1540 Py_UCS4, Py_UCS2,
1541 PyUnicode_4BYTE_DATA(from) + from_start,
1542 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1543 PyUnicode_2BYTE_DATA(to) + to_start
1544 );
1545 }
1546 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001547 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001548 }
1549 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001550 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001553 Py_ssize_t i;
1554
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 for (i=0; i < how_many; i++) {
1556 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001557 if (ch > to_maxchar)
1558 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001559 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1560 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001561 }
1562 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return 0;
1564}
1565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566void
1567_PyUnicode_FastCopyCharacters(
1568 PyObject *to, Py_ssize_t to_start,
1569 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570{
1571 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1572}
1573
1574Py_ssize_t
1575PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1576 PyObject *from, Py_ssize_t from_start,
1577 Py_ssize_t how_many)
1578{
1579 int err;
1580
1581 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1582 PyErr_BadInternalCall();
1583 return -1;
1584 }
1585
Benjamin Petersonbac79492012-01-14 13:34:47 -05001586 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001587 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001588 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001589 return -1;
1590
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001591 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001592 PyErr_SetString(PyExc_IndexError, "string index out of range");
1593 return -1;
1594 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001595 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001596 PyErr_SetString(PyExc_IndexError, "string index out of range");
1597 return -1;
1598 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001599 if (how_many < 0) {
1600 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1601 return -1;
1602 }
1603 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001604 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1605 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001606 "Cannot write %zi characters at %zi "
1607 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608 how_many, to_start, PyUnicode_GET_LENGTH(to));
1609 return -1;
1610 }
1611
1612 if (how_many == 0)
1613 return 0;
1614
Victor Stinner488fa492011-12-12 00:01:39 +01001615 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001616 return -1;
1617
1618 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1619 if (err) {
1620 PyErr_Format(PyExc_SystemError,
1621 "Cannot copy %s characters "
1622 "into a string of %s characters",
1623 unicode_kind_name(from),
1624 unicode_kind_name(to));
1625 return -1;
1626 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001627 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628}
1629
Victor Stinner17222162011-09-28 22:15:37 +02001630/* Find the maximum code point and count the number of surrogate pairs so a
1631 correct string length can be computed before converting a string to UCS4.
1632 This function counts single surrogates as a character and not as a pair.
1633
1634 Return 0 on success, or -1 on error. */
1635static int
1636find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1637 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638{
1639 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001640 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641
Victor Stinnerc53be962011-10-02 21:33:54 +02001642 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643 *num_surrogates = 0;
1644 *maxchar = 0;
1645
1646 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001648 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1649 && (iter+1) < end
1650 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1651 {
1652 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1653 ++(*num_surrogates);
1654 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 }
1656 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001658 {
1659 ch = *iter;
1660 iter++;
1661 }
1662 if (ch > *maxchar) {
1663 *maxchar = ch;
1664 if (*maxchar > MAX_UNICODE) {
1665 PyErr_Format(PyExc_ValueError,
1666 "character U+%x is not in range [U+0000; U+10ffff]",
1667 ch);
1668 return -1;
1669 }
1670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 }
1672 return 0;
1673}
1674
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001675int
1676_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677{
1678 wchar_t *end;
1679 Py_UCS4 maxchar = 0;
1680 Py_ssize_t num_surrogates;
1681#if SIZEOF_WCHAR_T == 2
1682 Py_ssize_t length_wo_surrogates;
1683#endif
1684
Georg Brandl7597add2011-10-05 16:36:47 +02001685 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001686 strings were created using _PyObject_New() and where no canonical
1687 representation (the str field) has been set yet aka strings
1688 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001689 assert(_PyUnicode_CHECK(unicode));
1690 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001694 /* Actually, it should neither be interned nor be anything else: */
1695 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001698 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001699 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701
1702 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001703 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1704 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 PyErr_NoMemory();
1706 return -1;
1707 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001708 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 _PyUnicode_WSTR(unicode), end,
1710 PyUnicode_1BYTE_DATA(unicode));
1711 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1712 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1713 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1714 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001715 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001716 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001720 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001721 _PyUnicode_UTF8(unicode) = NULL;
1722 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 }
1724 PyObject_FREE(_PyUnicode_WSTR(unicode));
1725 _PyUnicode_WSTR(unicode) = NULL;
1726 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1727 }
1728 /* In this case we might have to convert down from 4-byte native
1729 wchar_t to 2-byte unicode. */
1730 else if (maxchar < 65536) {
1731 assert(num_surrogates == 0 &&
1732 "FindMaxCharAndNumSurrogatePairs() messed up");
1733
Victor Stinner506f5922011-09-28 22:34:18 +02001734#if SIZEOF_WCHAR_T == 2
1735 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001736 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001737 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1738 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1739 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001740 _PyUnicode_UTF8(unicode) = NULL;
1741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001742#else
1743 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001744 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001745 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001746 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001747 PyErr_NoMemory();
1748 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 }
Victor Stinner506f5922011-09-28 22:34:18 +02001750 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1751 _PyUnicode_WSTR(unicode), end,
1752 PyUnicode_2BYTE_DATA(unicode));
1753 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1754 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001758 PyObject_FREE(_PyUnicode_WSTR(unicode));
1759 _PyUnicode_WSTR(unicode) = NULL;
1760 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1761#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 }
1763 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1764 else {
1765#if SIZEOF_WCHAR_T == 2
1766 /* in case the native representation is 2-bytes, we need to allocate a
1767 new normalized 4-byte version. */
1768 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001769 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1770 PyErr_NoMemory();
1771 return -1;
1772 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001773 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1774 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 PyErr_NoMemory();
1776 return -1;
1777 }
1778 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1779 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001780 _PyUnicode_UTF8(unicode) = NULL;
1781 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001782 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1783 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001784 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 PyObject_FREE(_PyUnicode_WSTR(unicode));
1786 _PyUnicode_WSTR(unicode) = NULL;
1787 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1788#else
1789 assert(num_surrogates == 0);
1790
Victor Stinnerc3c74152011-10-02 20:39:55 +02001791 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001793 _PyUnicode_UTF8(unicode) = NULL;
1794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1796#endif
1797 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1798 }
1799 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001800 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return 0;
1802}
1803
Alexander Belopolsky40018472011-02-26 01:02:56 +00001804static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001805unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806{
Walter Dörwald16807132007-05-25 13:52:07 +00001807 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 case SSTATE_NOT_INTERNED:
1809 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001810
Benjamin Peterson29060642009-01-31 22:14:21 +00001811 case SSTATE_INTERNED_MORTAL:
1812 /* revive dead object temporarily for DelItem */
1813 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001814 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001815 Py_FatalError(
1816 "deletion of interned string failed");
1817 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001818
Benjamin Peterson29060642009-01-31 22:14:21 +00001819 case SSTATE_INTERNED_IMMORTAL:
1820 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001821 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001822
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 default:
1824 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001825 }
1826
Victor Stinner03490912011-10-03 23:45:12 +02001827 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001829 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001830 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001831 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1832 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001834 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835}
1836
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string, or a cached one-character Latin-1 string), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a string of exactly one character that already has a canonical
       representation can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1853
Alexander Belopolsky40018472011-02-26 01:02:56 +00001854static int
Victor Stinner488fa492011-12-12 00:01:39 +01001855unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001856{
Victor Stinner488fa492011-12-12 00:01:39 +01001857 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001858 if (Py_REFCNT(unicode) != 1)
1859 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001860 if (_PyUnicode_HASH(unicode) != -1)
1861 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 if (PyUnicode_CHECK_INTERNED(unicode))
1863 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001864 if (!PyUnicode_CheckExact(unicode))
1865 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001866#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001867 /* singleton refcount is greater than 1 */
1868 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001869#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001870 return 1;
1871}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873static int
1874unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1875{
1876 PyObject *unicode;
1877 Py_ssize_t old_length;
1878
1879 assert(p_unicode != NULL);
1880 unicode = *p_unicode;
1881
1882 assert(unicode != NULL);
1883 assert(PyUnicode_Check(unicode));
1884 assert(0 <= length);
1885
Victor Stinner910337b2011-10-03 03:20:16 +02001886 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 old_length = PyUnicode_WSTR_LENGTH(unicode);
1888 else
1889 old_length = PyUnicode_GET_LENGTH(unicode);
1890 if (old_length == length)
1891 return 0;
1892
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001893 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001894 _Py_INCREF_UNICODE_EMPTY();
1895 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001896 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001897 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001898 return 0;
1899 }
1900
Victor Stinner488fa492011-12-12 00:01:39 +01001901 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 PyObject *copy = resize_copy(unicode, length);
1903 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001905 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001906 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001907 }
1908
Victor Stinnerfe226c02011-10-03 03:52:20 +02001909 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001910 PyObject *new_unicode = resize_compact(unicode, length);
1911 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001912 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001913 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001914 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001915 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001916 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001917}
1918
Alexander Belopolsky40018472011-02-26 01:02:56 +00001919int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001920PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001921{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001922 PyObject *unicode;
1923 if (p_unicode == NULL) {
1924 PyErr_BadInternalCall();
1925 return -1;
1926 }
1927 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001928 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001929 {
1930 PyErr_BadInternalCall();
1931 return -1;
1932 }
1933 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001934}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001935
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001936/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001937
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001938 WARNING: The function doesn't copy the terminating null character and
1939 doesn't check the maximum character (may write a latin1 character in an
1940 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001941static void
1942unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1943 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944{
1945 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1946 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001947 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948
1949 switch (kind) {
1950 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001951 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001952#ifdef Py_DEBUG
1953 if (PyUnicode_IS_ASCII(unicode)) {
1954 Py_UCS4 maxchar = ucs1lib_find_max_char(
1955 (const Py_UCS1*)str,
1956 (const Py_UCS1*)str + len);
1957 assert(maxchar < 128);
1958 }
1959#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001960 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001961 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001962 }
1963 case PyUnicode_2BYTE_KIND: {
1964 Py_UCS2 *start = (Py_UCS2 *)data + index;
1965 Py_UCS2 *ucs2 = start;
1966 assert(index <= PyUnicode_GET_LENGTH(unicode));
1967
Victor Stinner184252a2012-06-16 02:57:41 +02001968 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001969 *ucs2 = (Py_UCS2)*str;
1970
1971 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001972 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001973 }
1974 default: {
1975 Py_UCS4 *start = (Py_UCS4 *)data + index;
1976 Py_UCS4 *ucs4 = start;
1977 assert(kind == PyUnicode_4BYTE_KIND);
1978 assert(index <= PyUnicode_GET_LENGTH(unicode));
1979
Victor Stinner184252a2012-06-16 02:57:41 +02001980 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001981 *ucs4 = (Py_UCS4)*str;
1982
1983 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001984 }
1985 }
1986}
1987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988static PyObject*
1989get_latin1_char(unsigned char ch)
1990{
Victor Stinnera464fc12011-10-02 20:39:30 +02001991 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001993 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 if (!unicode)
1995 return NULL;
1996 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001997 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 unicode_latin1[ch] = unicode;
1999 }
2000 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002001 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002}
2003
Victor Stinner985a82a2014-01-03 12:53:47 +01002004static PyObject*
2005unicode_char(Py_UCS4 ch)
2006{
2007 PyObject *unicode;
2008
2009 assert(ch <= MAX_UNICODE);
2010
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002011 if (ch < 256)
2012 return get_latin1_char(ch);
2013
Victor Stinner985a82a2014-01-03 12:53:47 +01002014 unicode = PyUnicode_New(1, ch);
2015 if (unicode == NULL)
2016 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002017
2018 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2019 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002020 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002021 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002022 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2023 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2024 }
2025 assert(_PyUnicode_CheckConsistency(unicode, 1));
2026 return unicode;
2027}
2028
Alexander Belopolsky40018472011-02-26 01:02:56 +00002029PyObject *
2030PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002032 if (u == NULL)
2033 return (PyObject*)_PyUnicode_New(size);
2034
2035 if (size < 0) {
2036 PyErr_BadInternalCall();
2037 return NULL;
2038 }
2039
2040 return PyUnicode_FromWideChar(u, size);
2041}
2042
2043PyObject *
2044PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2045{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002046 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 Py_UCS4 maxchar = 0;
2048 Py_ssize_t num_surrogates;
2049
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002050 if (u == NULL && size != 0) {
2051 PyErr_BadInternalCall();
2052 return NULL;
2053 }
2054
2055 if (size == -1) {
2056 size = wcslen(u);
2057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002059 /* If the Unicode data is known at construction time, we can apply
2060 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002063 if (size == 0)
2064 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 /* Single character Unicode objects in the Latin-1 range are
2067 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002068 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069 return get_latin1_char((unsigned char)*u);
2070
2071 /* If not empty and not single character, copy the Unicode data
2072 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002073 if (find_maxchar_surrogates(u, u + size,
2074 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 return NULL;
2076
Victor Stinner8faf8212011-12-08 22:14:11 +01002077 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (!unicode)
2079 return NULL;
2080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 switch (PyUnicode_KIND(unicode)) {
2082 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002083 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2085 break;
2086 case PyUnicode_2BYTE_KIND:
2087#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002088 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002090 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2092#endif
2093 break;
2094 case PyUnicode_4BYTE_KIND:
2095#if SIZEOF_WCHAR_T == 2
2096 /* This is the only case which has to process surrogates, thus
2097 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002098 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099#else
2100 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002101 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102#endif
2103 break;
2104 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002105 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002108 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109}
2110
Alexander Belopolsky40018472011-02-26 01:02:56 +00002111PyObject *
2112PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002113{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 if (size < 0) {
2115 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002116 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002117 return NULL;
2118 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002119 if (u != NULL)
2120 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2121 else
2122 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002123}
2124
Alexander Belopolsky40018472011-02-26 01:02:56 +00002125PyObject *
2126PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002127{
2128 size_t size = strlen(u);
2129 if (size > PY_SSIZE_T_MAX) {
2130 PyErr_SetString(PyExc_OverflowError, "input too long");
2131 return NULL;
2132 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002133 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002134}
2135
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002136PyObject *
2137_PyUnicode_FromId(_Py_Identifier *id)
2138{
2139 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002140 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2141 strlen(id->string),
2142 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002143 if (!id->object)
2144 return NULL;
2145 PyUnicode_InternInPlace(&id->object);
2146 assert(!id->next);
2147 id->next = static_strings;
2148 static_strings = id;
2149 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002150 return id->object;
2151}
2152
2153void
2154_PyUnicode_ClearStaticStrings()
2155{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002156 _Py_Identifier *tmp, *s = static_strings;
2157 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002158 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002159 tmp = s->next;
2160 s->next = NULL;
2161 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002162 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002163 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002164}
2165
Benjamin Peterson0df54292012-03-26 14:50:32 -04002166/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002167
Victor Stinnerd3f08822012-05-29 12:57:52 +02002168PyObject*
2169_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002170{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002171 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002172 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002173 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002174#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002175 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002176#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002177 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002178 }
Victor Stinner785938e2011-12-11 20:09:03 +01002179 unicode = PyUnicode_New(size, 127);
2180 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002181 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002182 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2183 assert(_PyUnicode_CheckConsistency(unicode, 1));
2184 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002185}
2186
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002187static Py_UCS4
2188kind_maxchar_limit(unsigned int kind)
2189{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002190 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002191 case PyUnicode_1BYTE_KIND:
2192 return 0x80;
2193 case PyUnicode_2BYTE_KIND:
2194 return 0x100;
2195 case PyUnicode_4BYTE_KIND:
2196 return 0x10000;
2197 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002198 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002199 }
2200}
2201
Victor Stinner702c7342011-10-05 13:50:52 +02002202static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002203_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002207
Serhiy Storchaka678db842013-01-26 12:16:36 +02002208 if (size == 0)
2209 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002210 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002211 if (size == 1)
2212 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002214 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002215 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!res)
2217 return NULL;
2218 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002219 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002221}
2222
Victor Stinnere57b1c02011-09-28 22:20:48 +02002223static PyObject*
2224_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225{
2226 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002227 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228
Serhiy Storchaka678db842013-01-26 12:16:36 +02002229 if (size == 0)
2230 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002231 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002232 if (size == 1)
2233 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002234
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002235 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002236 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 if (!res)
2238 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002239 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002241 else {
2242 _PyUnicode_CONVERT_BYTES(
2243 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2244 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002245 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 return res;
2247}
2248
Victor Stinnere57b1c02011-09-28 22:20:48 +02002249static PyObject*
2250_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251{
2252 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002253 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002254
Serhiy Storchaka678db842013-01-26 12:16:36 +02002255 if (size == 0)
2256 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002257 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002258 if (size == 1)
2259 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002260
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002261 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002262 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 if (!res)
2264 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 if (max_char < 256)
2266 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2267 PyUnicode_1BYTE_DATA(res));
2268 else if (max_char < 0x10000)
2269 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2270 PyUnicode_2BYTE_DATA(res));
2271 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002273 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 return res;
2275}
2276
2277PyObject*
2278PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002280 if (size < 0) {
2281 PyErr_SetString(PyExc_ValueError, "size must be positive");
2282 return NULL;
2283 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002284 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002286 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002288 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002290 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002291 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 PyErr_SetString(PyExc_SystemError, "invalid kind");
2293 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295}
2296
Victor Stinnerece58de2012-04-23 23:36:38 +02002297Py_UCS4
2298_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299{
2300 enum PyUnicode_Kind kind;
2301 void *startptr, *endptr;
2302
2303 assert(PyUnicode_IS_READY(unicode));
2304 assert(0 <= start);
2305 assert(end <= PyUnicode_GET_LENGTH(unicode));
2306 assert(start <= end);
2307
2308 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2309 return PyUnicode_MAX_CHAR_VALUE(unicode);
2310
2311 if (start == end)
2312 return 127;
2313
Victor Stinner94d558b2012-04-27 22:26:58 +02002314 if (PyUnicode_IS_ASCII(unicode))
2315 return 127;
2316
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002318 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002319 endptr = (char *)startptr + end * kind;
2320 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002321 switch(kind) {
2322 case PyUnicode_1BYTE_KIND:
2323 return ucs1lib_find_max_char(startptr, endptr);
2324 case PyUnicode_2BYTE_KIND:
2325 return ucs2lib_find_max_char(startptr, endptr);
2326 case PyUnicode_4BYTE_KIND:
2327 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002328 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002329 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002330 }
2331}
2332
Victor Stinner25a4b292011-10-06 12:31:55 +02002333/* Ensure that a string uses the most efficient storage, if it is not the
2334 case: create a new string with of the right kind. Write NULL into *p_unicode
2335 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002336static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002337unicode_adjust_maxchar(PyObject **p_unicode)
2338{
2339 PyObject *unicode, *copy;
2340 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002341 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002342 unsigned int kind;
2343
2344 assert(p_unicode != NULL);
2345 unicode = *p_unicode;
2346 assert(PyUnicode_IS_READY(unicode));
2347 if (PyUnicode_IS_ASCII(unicode))
2348 return;
2349
2350 len = PyUnicode_GET_LENGTH(unicode);
2351 kind = PyUnicode_KIND(unicode);
2352 if (kind == PyUnicode_1BYTE_KIND) {
2353 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002354 max_char = ucs1lib_find_max_char(u, u + len);
2355 if (max_char >= 128)
2356 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002357 }
2358 else if (kind == PyUnicode_2BYTE_KIND) {
2359 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002360 max_char = ucs2lib_find_max_char(u, u + len);
2361 if (max_char >= 256)
2362 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002363 }
2364 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002365 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002366 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002367 max_char = ucs4lib_find_max_char(u, u + len);
2368 if (max_char >= 0x10000)
2369 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002370 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002372 if (copy != NULL)
2373 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002374 Py_DECREF(unicode);
2375 *p_unicode = copy;
2376}
2377
Victor Stinner034f6cf2011-09-30 02:26:44 +02002378PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002379_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002380{
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002382 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383
Victor Stinner034f6cf2011-09-30 02:26:44 +02002384 if (!PyUnicode_Check(unicode)) {
2385 PyErr_BadInternalCall();
2386 return NULL;
2387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002388 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002389 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390
Victor Stinner87af4f22011-11-21 23:03:47 +01002391 length = PyUnicode_GET_LENGTH(unicode);
2392 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002393 if (!copy)
2394 return NULL;
2395 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2396
Christian Heimesf051e432016-09-13 20:22:02 +02002397 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002398 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002399 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002400 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002401}
2402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403
Victor Stinnerbc603d12011-10-02 01:00:40 +02002404/* Widen Unicode objects to larger buffers. Don't write terminating null
2405 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406
2407void*
2408_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2409{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002410 Py_ssize_t len;
2411 void *result;
2412 unsigned int skind;
2413
Benjamin Petersonbac79492012-01-14 13:34:47 -05002414 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002415 return NULL;
2416
2417 len = PyUnicode_GET_LENGTH(s);
2418 skind = PyUnicode_KIND(s);
2419 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002420 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return NULL;
2422 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002423 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002424 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002425 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002426 if (!result)
2427 return PyErr_NoMemory();
2428 assert(skind == PyUnicode_1BYTE_KIND);
2429 _PyUnicode_CONVERT_BYTES(
2430 Py_UCS1, Py_UCS2,
2431 PyUnicode_1BYTE_DATA(s),
2432 PyUnicode_1BYTE_DATA(s) + len,
2433 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002435 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002436 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002437 if (!result)
2438 return PyErr_NoMemory();
2439 if (skind == PyUnicode_2BYTE_KIND) {
2440 _PyUnicode_CONVERT_BYTES(
2441 Py_UCS2, Py_UCS4,
2442 PyUnicode_2BYTE_DATA(s),
2443 PyUnicode_2BYTE_DATA(s) + len,
2444 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002446 else {
2447 assert(skind == PyUnicode_1BYTE_KIND);
2448 _PyUnicode_CONVERT_BYTES(
2449 Py_UCS1, Py_UCS4,
2450 PyUnicode_1BYTE_DATA(s),
2451 PyUnicode_1BYTE_DATA(s) + len,
2452 result);
2453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002455 default:
2456 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 }
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460}
2461
2462static Py_UCS4*
2463as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2464 int copy_null)
2465{
2466 int kind;
2467 void *data;
2468 Py_ssize_t len, targetlen;
2469 if (PyUnicode_READY(string) == -1)
2470 return NULL;
2471 kind = PyUnicode_KIND(string);
2472 data = PyUnicode_DATA(string);
2473 len = PyUnicode_GET_LENGTH(string);
2474 targetlen = len;
2475 if (copy_null)
2476 targetlen++;
2477 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002478 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 if (!target) {
2480 PyErr_NoMemory();
2481 return NULL;
2482 }
2483 }
2484 else {
2485 if (targetsize < targetlen) {
2486 PyErr_Format(PyExc_SystemError,
2487 "string is longer than the buffer");
2488 if (copy_null && 0 < targetsize)
2489 target[0] = 0;
2490 return NULL;
2491 }
2492 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002493 if (kind == PyUnicode_1BYTE_KIND) {
2494 Py_UCS1 *start = (Py_UCS1 *) data;
2495 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002497 else if (kind == PyUnicode_2BYTE_KIND) {
2498 Py_UCS2 *start = (Py_UCS2 *) data;
2499 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2500 }
2501 else {
2502 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002503 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 if (copy_null)
2506 target[len] = 0;
2507 return target;
2508}
2509
2510Py_UCS4*
2511PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2512 int copy_null)
2513{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002514 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 PyErr_BadInternalCall();
2516 return NULL;
2517 }
2518 return as_ucs4(string, target, targetsize, copy_null);
2519}
2520
2521Py_UCS4*
2522PyUnicode_AsUCS4Copy(PyObject *string)
2523{
2524 return as_ucs4(string, NULL, 0, 1);
2525}
2526
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256).
   NOTE(review): the constant added is 2, presumably the sign plus a
   terminating NUL -- confirm against the sprintf() buffers using it. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002531
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002532static int
2533unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2534 Py_ssize_t width, Py_ssize_t precision)
2535{
2536 Py_ssize_t length, fill, arglen;
2537 Py_UCS4 maxchar;
2538
2539 if (PyUnicode_READY(str) == -1)
2540 return -1;
2541
2542 length = PyUnicode_GET_LENGTH(str);
2543 if ((precision == -1 || precision >= length)
2544 && width <= length)
2545 return _PyUnicodeWriter_WriteStr(writer, str);
2546
2547 if (precision != -1)
2548 length = Py_MIN(precision, length);
2549
2550 arglen = Py_MAX(length, width);
2551 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2552 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2553 else
2554 maxchar = writer->maxchar;
2555
2556 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2557 return -1;
2558
2559 if (width > length) {
2560 fill = width - length;
2561 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2562 return -1;
2563 writer->pos += fill;
2564 }
2565
2566 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2567 str, 0, length);
2568 writer->pos += length;
2569 return 0;
2570}
2571
2572static int
Victor Stinner998b8062018-09-12 00:23:25 +02002573unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width, Py_ssize_t precision)
2575{
2576 /* UTF-8 */
2577 Py_ssize_t length;
2578 PyObject *unicode;
2579 int res;
2580
2581 length = strlen(str);
2582 if (precision != -1)
2583 length = Py_MIN(length, precision);
2584 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2585 if (unicode == NULL)
2586 return -1;
2587
2588 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2589 Py_DECREF(unicode);
2590 return res;
2591}
2592
Victor Stinner96865452011-03-01 23:44:09 +00002593static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002594unicode_fromformat_arg(_PyUnicodeWriter *writer,
2595 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002596{
Victor Stinnere215d962012-10-06 23:03:36 +02002597 const char *p;
2598 Py_ssize_t len;
2599 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 Py_ssize_t width;
2601 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002602 int longflag;
2603 int longlongflag;
2604 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002605 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002606
2607 p = f;
2608 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002609 zeropad = 0;
2610 if (*f == '0') {
2611 zeropad = 1;
2612 f++;
2613 }
Victor Stinner96865452011-03-01 23:44:09 +00002614
2615 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 width = -1;
2617 if (Py_ISDIGIT((unsigned)*f)) {
2618 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002619 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002620 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002621 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002622 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002623 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002624 return NULL;
2625 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002626 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002627 f++;
2628 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 }
2630 precision = -1;
2631 if (*f == '.') {
2632 f++;
2633 if (Py_ISDIGIT((unsigned)*f)) {
2634 precision = (*f - '0');
2635 f++;
2636 while (Py_ISDIGIT((unsigned)*f)) {
2637 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2638 PyErr_SetString(PyExc_ValueError,
2639 "precision too big");
2640 return NULL;
2641 }
2642 precision = (precision * 10) + (*f - '0');
2643 f++;
2644 }
2645 }
Victor Stinner96865452011-03-01 23:44:09 +00002646 if (*f == '%') {
2647 /* "%.3%s" => f points to "3" */
2648 f--;
2649 }
2650 }
2651 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002652 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002653 f--;
2654 }
Victor Stinner96865452011-03-01 23:44:09 +00002655
2656 /* Handle %ld, %lu, %lld and %llu. */
2657 longflag = 0;
2658 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002659 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002660 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002661 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002662 longflag = 1;
2663 ++f;
2664 }
Victor Stinner96865452011-03-01 23:44:09 +00002665 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002666 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002667 longlongflag = 1;
2668 f += 2;
2669 }
Victor Stinner96865452011-03-01 23:44:09 +00002670 }
2671 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002672 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002673 size_tflag = 1;
2674 ++f;
2675 }
Victor Stinnere215d962012-10-06 23:03:36 +02002676
2677 if (f[1] == '\0')
2678 writer->overallocate = 0;
2679
2680 switch (*f) {
2681 case 'c':
2682 {
2683 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002684 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002685 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002686 "character argument not in range(0x110000)");
2687 return NULL;
2688 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002689 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002690 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002691 break;
2692 }
2693
2694 case 'i':
2695 case 'd':
2696 case 'u':
2697 case 'x':
2698 {
2699 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002700 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002701 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002702
2703 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002704 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002705 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002706 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002707 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002708 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002709 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002710 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002712 va_arg(*vargs, size_t));
2713 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002714 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002715 va_arg(*vargs, unsigned int));
2716 }
2717 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002719 }
2720 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002721 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002722 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002723 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002724 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002725 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002726 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002727 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002728 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002729 va_arg(*vargs, Py_ssize_t));
2730 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002731 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002732 va_arg(*vargs, int));
2733 }
2734 assert(len >= 0);
2735
Victor Stinnere215d962012-10-06 23:03:36 +02002736 if (precision < len)
2737 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002738
2739 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002740 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2741 return NULL;
2742
Victor Stinnere215d962012-10-06 23:03:36 +02002743 if (width > precision) {
2744 Py_UCS4 fillchar;
2745 fill = width - precision;
2746 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002747 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2748 return NULL;
2749 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002750 }
Victor Stinner15a11362012-10-06 23:48:20 +02002751 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002752 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002753 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2754 return NULL;
2755 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002756 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002757
Victor Stinner4a587072013-11-19 12:54:53 +01002758 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2759 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002760 break;
2761 }
2762
2763 case 'p':
2764 {
2765 char number[MAX_LONG_LONG_CHARS];
2766
2767 len = sprintf(number, "%p", va_arg(*vargs, void*));
2768 assert(len >= 0);
2769
2770 /* %p is ill-defined: ensure leading 0x. */
2771 if (number[1] == 'X')
2772 number[1] = 'x';
2773 else if (number[1] != 'x') {
2774 memmove(number + 2, number,
2775 strlen(number) + 1);
2776 number[0] = '0';
2777 number[1] = 'x';
2778 len += 2;
2779 }
2780
Victor Stinner4a587072013-11-19 12:54:53 +01002781 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002782 return NULL;
2783 break;
2784 }
2785
2786 case 's':
2787 {
2788 /* UTF-8 */
2789 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002790 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002792 break;
2793 }
2794
2795 case 'U':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 assert(obj && _PyUnicode_CHECK(obj));
2799
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002800 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002801 return NULL;
2802 break;
2803 }
2804
2805 case 'V':
2806 {
2807 PyObject *obj = va_arg(*vargs, PyObject *);
2808 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002809 if (obj) {
2810 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002811 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002812 return NULL;
2813 }
2814 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002815 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002816 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002817 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002818 }
2819 break;
2820 }
2821
2822 case 'S':
2823 {
2824 PyObject *obj = va_arg(*vargs, PyObject *);
2825 PyObject *str;
2826 assert(obj);
2827 str = PyObject_Str(obj);
2828 if (!str)
2829 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002830 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002831 Py_DECREF(str);
2832 return NULL;
2833 }
2834 Py_DECREF(str);
2835 break;
2836 }
2837
2838 case 'R':
2839 {
2840 PyObject *obj = va_arg(*vargs, PyObject *);
2841 PyObject *repr;
2842 assert(obj);
2843 repr = PyObject_Repr(obj);
2844 if (!repr)
2845 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002846 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002847 Py_DECREF(repr);
2848 return NULL;
2849 }
2850 Py_DECREF(repr);
2851 break;
2852 }
2853
2854 case 'A':
2855 {
2856 PyObject *obj = va_arg(*vargs, PyObject *);
2857 PyObject *ascii;
2858 assert(obj);
2859 ascii = PyObject_ASCII(obj);
2860 if (!ascii)
2861 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002862 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002863 Py_DECREF(ascii);
2864 return NULL;
2865 }
2866 Py_DECREF(ascii);
2867 break;
2868 }
2869
2870 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002871 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002872 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002873 break;
2874
2875 default:
2876 /* if we stumble upon an unknown formatting code, copy the rest
2877 of the format string to the output string. (we cannot just
2878 skip the code, since there's no way to know what's in the
2879 argument list) */
2880 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002881 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002882 return NULL;
2883 f = p+len;
2884 return f;
2885 }
2886
2887 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002888 return f;
2889}
2890
Walter Dörwaldd2034312007-05-18 16:29:38 +00002891PyObject *
2892PyUnicode_FromFormatV(const char *format, va_list vargs)
2893{
Victor Stinnere215d962012-10-06 23:03:36 +02002894 va_list vargs2;
2895 const char *f;
2896 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002897
Victor Stinner8f674cc2013-04-17 23:02:17 +02002898 _PyUnicodeWriter_Init(&writer);
2899 writer.min_length = strlen(format) + 100;
2900 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002901
Benjamin Peterson0c212142016-09-20 20:39:33 -07002902 // Copy varags to be able to pass a reference to a subfunction.
2903 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002904
2905 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002906 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002907 f = unicode_fromformat_arg(&writer, f, &vargs2);
2908 if (f == NULL)
2909 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002911 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002912 const char *p;
2913 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002914
Victor Stinnere215d962012-10-06 23:03:36 +02002915 p = f;
2916 do
2917 {
2918 if ((unsigned char)*p > 127) {
2919 PyErr_Format(PyExc_ValueError,
2920 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2921 "string, got a non-ASCII byte: 0x%02x",
2922 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002923 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002924 }
2925 p++;
2926 }
2927 while (*p != '\0' && *p != '%');
2928 len = p - f;
2929
2930 if (*p == '\0')
2931 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002932
2933 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002934 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002935
2936 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002937 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002939 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002940 return _PyUnicodeWriter_Finish(&writer);
2941
2942 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002943 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002944 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946}
2947
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948PyObject *
2949PyUnicode_FromFormat(const char *format, ...)
2950{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002951 PyObject* ret;
2952 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002953
2954#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002955 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002956#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002957 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002958#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002959 ret = PyUnicode_FromFormatV(format, vargs);
2960 va_end(vargs);
2961 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002962}
2963
Serhiy Storchakac46db922018-10-23 22:58:24 +03002964static Py_ssize_t
2965unicode_get_widechar_size(PyObject *unicode)
2966{
2967 Py_ssize_t res;
2968
2969 assert(unicode != NULL);
2970 assert(_PyUnicode_CHECK(unicode));
2971
2972 if (_PyUnicode_WSTR(unicode) != NULL) {
2973 return PyUnicode_WSTR_LENGTH(unicode);
2974 }
2975 assert(PyUnicode_IS_READY(unicode));
2976
2977 res = _PyUnicode_LENGTH(unicode);
2978#if SIZEOF_WCHAR_T == 2
2979 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2980 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2981 const Py_UCS4 *end = s + res;
2982 for (; s < end; ++s) {
2983 if (*s > 0xFFFF) {
2984 ++res;
2985 }
2986 }
2987 }
2988#endif
2989 return res;
2990}
2991
2992static void
2993unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
2994{
2995 const wchar_t *wstr;
2996
2997 assert(unicode != NULL);
2998 assert(_PyUnicode_CHECK(unicode));
2999
3000 wstr = _PyUnicode_WSTR(unicode);
3001 if (wstr != NULL) {
3002 memcpy(w, wstr, size * sizeof(wchar_t));
3003 return;
3004 }
3005 assert(PyUnicode_IS_READY(unicode));
3006
3007 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3008 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3009 for (; size--; ++s, ++w) {
3010 *w = *s;
3011 }
3012 }
3013 else {
3014#if SIZEOF_WCHAR_T == 4
3015 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3016 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3017 for (; size--; ++s, ++w) {
3018 *w = *s;
3019 }
3020#else
3021 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3022 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3023 for (; size--; ++s, ++w) {
3024 Py_UCS4 ch = *s;
3025 if (ch > 0xFFFF) {
3026 assert(ch <= MAX_UNICODE);
3027 /* encode surrogate pair in this case */
3028 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3029 if (!size--)
3030 break;
3031 *w = Py_UNICODE_LOW_SURROGATE(ch);
3032 }
3033 else {
3034 *w = ch;
3035 }
3036 }
3037#endif
3038 }
3039}
3040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003041#ifdef HAVE_WCHAR_H
3042
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003043/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003044
Victor Stinnerd88d9832011-09-06 02:00:05 +02003045 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003046 character) required to convert the unicode object. Ignore size argument.
3047
Victor Stinnerd88d9832011-09-06 02:00:05 +02003048 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003049 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003050 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003051Py_ssize_t
3052PyUnicode_AsWideChar(PyObject *unicode,
3053 wchar_t *w,
3054 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003055{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003056 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003057
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003058 if (unicode == NULL) {
3059 PyErr_BadInternalCall();
3060 return -1;
3061 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003062 if (!PyUnicode_Check(unicode)) {
3063 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003064 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003065 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003066
3067 res = unicode_get_widechar_size(unicode);
3068 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003069 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003070 }
3071
3072 if (size > res) {
3073 size = res + 1;
3074 }
3075 else {
3076 res = size;
3077 }
3078 unicode_copy_as_widechar(unicode, w, size);
3079 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003080}
3081
Victor Stinner137c34c2010-09-29 10:25:54 +00003082wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003083PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003084 Py_ssize_t *size)
3085{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003086 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003087 Py_ssize_t buflen;
3088
3089 if (unicode == NULL) {
3090 PyErr_BadInternalCall();
3091 return NULL;
3092 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003093 if (!PyUnicode_Check(unicode)) {
3094 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003095 return NULL;
3096 }
3097
Serhiy Storchakac46db922018-10-23 22:58:24 +03003098 buflen = unicode_get_widechar_size(unicode);
3099 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003100 if (buffer == NULL) {
3101 PyErr_NoMemory();
3102 return NULL;
3103 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003104 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3105 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003106 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003107 }
3108 else if (wcslen(buffer) != (size_t)buflen) {
3109 PyMem_FREE(buffer);
3110 PyErr_SetString(PyExc_ValueError,
3111 "embedded null character");
3112 return NULL;
3113 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003114 return buffer;
3115}
3116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003117#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118
Alexander Belopolsky40018472011-02-26 01:02:56 +00003119PyObject *
3120PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003121{
Victor Stinner8faf8212011-12-08 22:14:11 +01003122 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003123 PyErr_SetString(PyExc_ValueError,
3124 "chr() arg not in range(0x110000)");
3125 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003126 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003127
Victor Stinner985a82a2014-01-03 12:53:47 +01003128 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003132PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003134 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003135 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003136 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003137 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003138 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003139 Py_INCREF(obj);
3140 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003141 }
3142 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003143 /* For a Unicode subtype that's not a Unicode object,
3144 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003145 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003146 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003147 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003148 "Can't convert '%.100s' object to str implicitly",
3149 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003150 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003151}
3152
Alexander Belopolsky40018472011-02-26 01:02:56 +00003153PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003154PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003155 const char *encoding,
3156 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003157{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003158 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003159 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003160
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003162 PyErr_BadInternalCall();
3163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003165
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003166 /* Decoding bytes objects is the most common case and should be fast */
3167 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003168 if (PyBytes_GET_SIZE(obj) == 0)
3169 _Py_RETURN_UNICODE_EMPTY();
3170 v = PyUnicode_Decode(
3171 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3172 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003173 return v;
3174 }
3175
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003176 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003177 PyErr_SetString(PyExc_TypeError,
3178 "decoding str is not supported");
3179 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003180 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003181
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003182 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3183 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3184 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003185 "decoding to str: need a bytes-like object, %.80s found",
3186 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003187 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003188 }
Tim Petersced69f82003-09-16 20:30:58 +00003189
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003190 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003191 PyBuffer_Release(&buffer);
3192 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003194
Serhiy Storchaka05997252013-01-26 12:14:02 +02003195 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003196 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003197 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198}
3199
/* Normalize an encoding name into lower, in the spirit of
   encodings.normalize_encoding() but additionally lowercased.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters becomes a single '_', except that a leading run is
   dropped and a trailing run produces nothing.  The output is always
   null-terminated on success.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* Last usable slot: reserved for the terminating null. */
    char *const out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3248
Alexander Belopolsky40018472011-02-26 01:02:56 +00003249PyObject *
3250PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003251 Py_ssize_t size,
3252 const char *encoding,
3253 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003254{
3255 PyObject *buffer = NULL, *unicode;
3256 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003257 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3258
3259 if (encoding == NULL) {
3260 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3261 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003262
Fred Drakee4315f52000-05-09 19:53:39 +00003263 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003264 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3265 char *lower = buflower;
3266
3267 /* Fast paths */
3268 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3269 lower += 3;
3270 if (*lower == '_') {
3271 /* Match "utf8" and "utf_8" */
3272 lower++;
3273 }
3274
3275 if (lower[0] == '8' && lower[1] == 0) {
3276 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3277 }
3278 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3279 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3280 }
3281 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3282 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3283 }
3284 }
3285 else {
3286 if (strcmp(lower, "ascii") == 0
3287 || strcmp(lower, "us_ascii") == 0) {
3288 return PyUnicode_DecodeASCII(s, size, errors);
3289 }
Steve Dowercc16be82016-09-08 10:35:16 -07003290 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003291 else if (strcmp(lower, "mbcs") == 0) {
3292 return PyUnicode_DecodeMBCS(s, size, errors);
3293 }
3294 #endif
3295 else if (strcmp(lower, "latin1") == 0
3296 || strcmp(lower, "latin_1") == 0
3297 || strcmp(lower, "iso_8859_1") == 0
3298 || strcmp(lower, "iso8859_1") == 0) {
3299 return PyUnicode_DecodeLatin1(s, size, errors);
3300 }
3301 }
Victor Stinner37296e82010-06-10 13:36:23 +00003302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303
3304 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003305 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003306 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003307 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003308 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 if (buffer == NULL)
3310 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003311 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 if (unicode == NULL)
3313 goto onError;
3314 if (!PyUnicode_Check(unicode)) {
3315 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003316 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003317 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003318 encoding,
3319 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 Py_DECREF(unicode);
3321 goto onError;
3322 }
3323 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003324 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003325
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 Py_XDECREF(buffer);
3328 return NULL;
3329}
3330
Alexander Belopolsky40018472011-02-26 01:02:56 +00003331PyObject *
3332PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003333 const char *encoding,
3334 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003335{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003336 if (!PyUnicode_Check(unicode)) {
3337 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003338 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003339 }
3340
Serhiy Storchaka00939072016-10-27 21:05:49 +03003341 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3342 "PyUnicode_AsDecodedObject() is deprecated; "
3343 "use PyCodec_Decode() to decode from str", 1) < 0)
3344 return NULL;
3345
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003346 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003348
3349 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003350 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003351}
3352
Alexander Belopolsky40018472011-02-26 01:02:56 +00003353PyObject *
3354PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003355 const char *encoding,
3356 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003357{
3358 PyObject *v;
3359
3360 if (!PyUnicode_Check(unicode)) {
3361 PyErr_BadArgument();
3362 goto onError;
3363 }
3364
Serhiy Storchaka00939072016-10-27 21:05:49 +03003365 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3366 "PyUnicode_AsDecodedUnicode() is deprecated; "
3367 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3368 return NULL;
3369
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003370 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003372
3373 /* Decode via the codec registry */
3374 v = PyCodec_Decode(unicode, encoding, errors);
3375 if (v == NULL)
3376 goto onError;
3377 if (!PyUnicode_Check(v)) {
3378 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003379 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003380 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003381 encoding,
3382 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003383 Py_DECREF(v);
3384 goto onError;
3385 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003386 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003387
Benjamin Peterson29060642009-01-31 22:14:21 +00003388 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389 return NULL;
3390}
3391
Alexander Belopolsky40018472011-02-26 01:02:56 +00003392PyObject *
3393PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003394 Py_ssize_t size,
3395 const char *encoding,
3396 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397{
3398 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003399
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003400 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3404 Py_DECREF(unicode);
3405 return v;
3406}
3407
Alexander Belopolsky40018472011-02-26 01:02:56 +00003408PyObject *
3409PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003410 const char *encoding,
3411 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003412{
3413 PyObject *v;
3414
3415 if (!PyUnicode_Check(unicode)) {
3416 PyErr_BadArgument();
3417 goto onError;
3418 }
3419
Serhiy Storchaka00939072016-10-27 21:05:49 +03003420 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3421 "PyUnicode_AsEncodedObject() is deprecated; "
3422 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3423 "or PyCodec_Encode() for generic encoding", 1) < 0)
3424 return NULL;
3425
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003426 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003428
3429 /* Encode via the codec registry */
3430 v = PyCodec_Encode(unicode, encoding, errors);
3431 if (v == NULL)
3432 goto onError;
3433 return v;
3434
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003436 return NULL;
3437}
3438
Victor Stinner1b579672011-12-17 05:47:23 +01003439
Victor Stinner2cba6b82018-01-10 22:46:15 +01003440static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003441unicode_encode_locale(PyObject *unicode, const char *errors,
3442 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003443{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003444 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003445
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003446 Py_ssize_t wlen;
3447 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3448 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003449 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003450 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003451
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003452 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003453 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003454 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003455 return NULL;
3456 }
3457
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003458 char *str;
3459 size_t error_pos;
3460 const char *reason;
3461 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003462 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003463 PyMem_Free(wstr);
3464
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003465 if (res != 0) {
3466 if (res == -2) {
3467 PyObject *exc;
3468 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3469 "locale", unicode,
3470 (Py_ssize_t)error_pos,
3471 (Py_ssize_t)(error_pos+1),
3472 reason);
3473 if (exc != NULL) {
3474 PyCodec_StrictErrors(exc);
3475 Py_DECREF(exc);
3476 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003477 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003478 else if (res == -3) {
3479 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3480 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003481 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003482 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003483 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003484 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003485 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003486
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003487 PyObject *bytes = PyBytes_FromString(str);
3488 PyMem_RawFree(str);
3489 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003490}
3491
Victor Stinnerad158722010-10-27 00:25:46 +00003492PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003493PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3494{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003495 return unicode_encode_locale(unicode, errors, 1);
3496}
3497
3498PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003499PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003500{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003501 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003502 const _PyCoreConfig *config = &interp->core_config;
3503#if defined(__APPLE__)
3504 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3505#else
Victor Stinner793b5312011-04-27 00:24:21 +02003506 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3507 cannot use it to encode and decode filenames before it is loaded. Load
3508 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003509 implementation of the locale codec until the codec registry is
3510 initialized and the Python codec is loaded. See initfsencoding(). */
3511 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003512 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003513 config->filesystem_encoding,
3514 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003515 }
3516 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003517 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003518 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003519 }
Victor Stinnerad158722010-10-27 00:25:46 +00003520#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003521}
3522
Alexander Belopolsky40018472011-02-26 01:02:56 +00003523PyObject *
3524PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003525 const char *encoding,
3526 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527{
3528 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003529 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003530
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 if (!PyUnicode_Check(unicode)) {
3532 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 }
Fred Drakee4315f52000-05-09 19:53:39 +00003535
Victor Stinner942889a2016-09-05 15:40:10 -07003536 if (encoding == NULL) {
3537 return _PyUnicode_AsUTF8String(unicode, errors);
3538 }
3539
Fred Drakee4315f52000-05-09 19:53:39 +00003540 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003541 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3542 char *lower = buflower;
3543
3544 /* Fast paths */
3545 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3546 lower += 3;
3547 if (*lower == '_') {
3548 /* Match "utf8" and "utf_8" */
3549 lower++;
3550 }
3551
3552 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003553 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003554 }
3555 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3556 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3557 }
3558 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3559 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3560 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003561 }
Victor Stinner942889a2016-09-05 15:40:10 -07003562 else {
3563 if (strcmp(lower, "ascii") == 0
3564 || strcmp(lower, "us_ascii") == 0) {
3565 return _PyUnicode_AsASCIIString(unicode, errors);
3566 }
Steve Dowercc16be82016-09-08 10:35:16 -07003567#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003568 else if (strcmp(lower, "mbcs") == 0) {
3569 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3570 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003571#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003572 else if (strcmp(lower, "latin1") == 0 ||
3573 strcmp(lower, "latin_1") == 0 ||
3574 strcmp(lower, "iso_8859_1") == 0 ||
3575 strcmp(lower, "iso8859_1") == 0) {
3576 return _PyUnicode_AsLatin1String(unicode, errors);
3577 }
3578 }
Victor Stinner37296e82010-06-10 13:36:23 +00003579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580
3581 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003582 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003584 return NULL;
3585
3586 /* The normal path */
3587 if (PyBytes_Check(v))
3588 return v;
3589
3590 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003591 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003592 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003593 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003594
3595 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003596 "encoder %s returned bytearray instead of bytes; "
3597 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003598 encoding);
3599 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003600 Py_DECREF(v);
3601 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003602 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003603
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003604 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3605 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003606 Py_DECREF(v);
3607 return b;
3608 }
3609
3610 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003611 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003612 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003613 encoding,
3614 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003615 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003616 return NULL;
3617}
3618
Alexander Belopolsky40018472011-02-26 01:02:56 +00003619PyObject *
3620PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003621 const char *encoding,
3622 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003623{
3624 PyObject *v;
3625
3626 if (!PyUnicode_Check(unicode)) {
3627 PyErr_BadArgument();
3628 goto onError;
3629 }
3630
Serhiy Storchaka00939072016-10-27 21:05:49 +03003631 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3632 "PyUnicode_AsEncodedUnicode() is deprecated; "
3633 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3634 return NULL;
3635
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003636 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003638
3639 /* Encode via the codec registry */
3640 v = PyCodec_Encode(unicode, encoding, errors);
3641 if (v == NULL)
3642 goto onError;
3643 if (!PyUnicode_Check(v)) {
3644 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003645 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003646 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003647 encoding,
3648 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003649 Py_DECREF(v);
3650 goto onError;
3651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003653
Benjamin Peterson29060642009-01-31 22:14:21 +00003654 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 return NULL;
3656}
3657
Victor Stinner2cba6b82018-01-10 22:46:15 +01003658static PyObject*
3659unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3660 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003661{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003662 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003663
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003664 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3665 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003666 return NULL;
3667 }
3668
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003669 wchar_t *wstr;
3670 size_t wlen;
3671 const char *reason;
3672 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003673 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003674 if (res != 0) {
3675 if (res == -2) {
3676 PyObject *exc;
3677 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3678 "locale", str, len,
3679 (Py_ssize_t)wlen,
3680 (Py_ssize_t)(wlen + 1),
3681 reason);
3682 if (exc != NULL) {
3683 PyCodec_StrictErrors(exc);
3684 Py_DECREF(exc);
3685 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003686 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003687 else if (res == -3) {
3688 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3689 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003690 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003691 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003692 }
Victor Stinner2f197072011-12-17 07:08:30 +01003693 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003694 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003695
3696 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3697 PyMem_RawFree(wstr);
3698 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003699}
3700
3701PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003702PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3703 const char *errors)
3704{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003705 return unicode_decode_locale(str, len, errors, 1);
3706}
3707
3708PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003709PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003710{
3711 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003712 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003713}
3714
3715
3716PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003717PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003718 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003719 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3720}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003721
Christian Heimes5894ba72007-11-04 11:43:14 +00003722PyObject*
3723PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3724{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003725 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003726 const _PyCoreConfig *config = &interp->core_config;
3727#if defined(__APPLE__)
3728 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3729#else
Victor Stinner793b5312011-04-27 00:24:21 +02003730 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3731 cannot use it to encode and decode filenames before it is loaded. Load
3732 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003733 implementation of the locale codec until the codec registry is
3734 initialized and the Python codec is loaded. See initfsencoding(). */
3735 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003736 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003737 config->filesystem_encoding,
3738 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003739 }
3740 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003741 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003742 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003743 }
Victor Stinnerad158722010-10-27 00:25:46 +00003744#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003745}
3746
Martin v. Löwis011e8422009-05-05 04:43:17 +00003747
3748int
3749PyUnicode_FSConverter(PyObject* arg, void* addr)
3750{
Brett Cannonec6ce872016-09-06 15:50:29 -07003751 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003752 PyObject *output = NULL;
3753 Py_ssize_t size;
3754 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003755 if (arg == NULL) {
3756 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003757 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003758 return 1;
3759 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003760 path = PyOS_FSPath(arg);
3761 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003762 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003763 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003764 if (PyBytes_Check(path)) {
3765 output = path;
3766 }
3767 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3768 output = PyUnicode_EncodeFSDefault(path);
3769 Py_DECREF(path);
3770 if (!output) {
3771 return 0;
3772 }
3773 assert(PyBytes_Check(output));
3774 }
3775
Victor Stinner0ea2a462010-04-30 00:22:08 +00003776 size = PyBytes_GET_SIZE(output);
3777 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003778 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003779 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003780 Py_DECREF(output);
3781 return 0;
3782 }
3783 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003784 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003785}
3786
3787
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003788int
3789PyUnicode_FSDecoder(PyObject* arg, void* addr)
3790{
Brett Cannona5711202016-09-06 19:36:01 -07003791 int is_buffer = 0;
3792 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003793 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003794 if (arg == NULL) {
3795 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003796 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003797 return 1;
3798 }
Brett Cannona5711202016-09-06 19:36:01 -07003799
3800 is_buffer = PyObject_CheckBuffer(arg);
3801 if (!is_buffer) {
3802 path = PyOS_FSPath(arg);
3803 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003804 return 0;
3805 }
Brett Cannona5711202016-09-06 19:36:01 -07003806 }
3807 else {
3808 path = arg;
3809 Py_INCREF(arg);
3810 }
3811
3812 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003813 output = path;
3814 }
3815 else if (PyBytes_Check(path) || is_buffer) {
3816 PyObject *path_bytes = NULL;
3817
3818 if (!PyBytes_Check(path) &&
3819 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003820 "path should be string, bytes, or os.PathLike, not %.200s",
3821 Py_TYPE(arg)->tp_name)) {
3822 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003823 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003824 }
3825 path_bytes = PyBytes_FromObject(path);
3826 Py_DECREF(path);
3827 if (!path_bytes) {
3828 return 0;
3829 }
3830 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3831 PyBytes_GET_SIZE(path_bytes));
3832 Py_DECREF(path_bytes);
3833 if (!output) {
3834 return 0;
3835 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003836 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003837 else {
3838 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003839 "path should be string, bytes, or os.PathLike, not %.200s",
3840 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003841 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003842 return 0;
3843 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003844 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003845 Py_DECREF(output);
3846 return 0;
3847 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003849 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003850 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003851 Py_DECREF(output);
3852 return 0;
3853 }
3854 *(PyObject**)addr = output;
3855 return Py_CLEANUP_SUPPORTED;
3856}
3857
3858
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003859const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003861{
Christian Heimesf3863112007-11-22 07:46:41 +00003862 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003864 if (!PyUnicode_Check(unicode)) {
3865 PyErr_BadArgument();
3866 return NULL;
3867 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003868 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003869 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003871 if (PyUnicode_UTF8(unicode) == NULL) {
3872 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003873 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 if (bytes == NULL)
3875 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003876 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3877 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003878 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 Py_DECREF(bytes);
3880 return NULL;
3881 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003883 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003884 PyBytes_AS_STRING(bytes),
3885 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886 Py_DECREF(bytes);
3887 }
3888
3889 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003890 *psize = PyUnicode_UTF8_LENGTH(unicode);
3891 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003892}
3893
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003894const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003896{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3898}
3899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900Py_UNICODE *
3901PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 if (!PyUnicode_Check(unicode)) {
3904 PyErr_BadArgument();
3905 return NULL;
3906 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003907 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3908 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003910 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003911 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912
Serhiy Storchakac46db922018-10-23 22:58:24 +03003913 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3914 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3915 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003918 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3919 if (w == NULL) {
3920 PyErr_NoMemory();
3921 return NULL;
3922 }
3923 unicode_copy_as_widechar(unicode, w, wlen + 1);
3924 _PyUnicode_WSTR(unicode) = w;
3925 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3926 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003927 }
3928 }
3929 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003930 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003931 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003932}
3933
Alexander Belopolsky40018472011-02-26 01:02:56 +00003934Py_UNICODE *
3935PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938}
3939
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003940const Py_UNICODE *
3941_PyUnicode_AsUnicode(PyObject *unicode)
3942{
3943 Py_ssize_t size;
3944 const Py_UNICODE *wstr;
3945
3946 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3947 if (wstr && wcslen(wstr) != (size_t)size) {
3948 PyErr_SetString(PyExc_ValueError, "embedded null character");
3949 return NULL;
3950 }
3951 return wstr;
3952}
3953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954
Alexander Belopolsky40018472011-02-26 01:02:56 +00003955Py_ssize_t
3956PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957{
3958 if (!PyUnicode_Check(unicode)) {
3959 PyErr_BadArgument();
3960 goto onError;
3961 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003962 if (_PyUnicode_WSTR(unicode) == NULL) {
3963 if (PyUnicode_AsUnicode(unicode) == NULL)
3964 goto onError;
3965 }
3966 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967
Benjamin Peterson29060642009-01-31 22:14:21 +00003968 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 return -1;
3970}
3971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972Py_ssize_t
3973PyUnicode_GetLength(PyObject *unicode)
3974{
Victor Stinner07621332012-06-16 04:53:46 +02003975 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976 PyErr_BadArgument();
3977 return -1;
3978 }
Victor Stinner07621332012-06-16 04:53:46 +02003979 if (PyUnicode_READY(unicode) == -1)
3980 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 return PyUnicode_GET_LENGTH(unicode);
3982}
3983
3984Py_UCS4
3985PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3986{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003987 void *data;
3988 int kind;
3989
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003990 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003991 PyErr_BadArgument();
3992 return (Py_UCS4)-1;
3993 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003994 if (PyUnicode_READY(unicode) == -1) {
3995 return (Py_UCS4)-1;
3996 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003997 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003998 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 return (Py_UCS4)-1;
4000 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004001 data = PyUnicode_DATA(unicode);
4002 kind = PyUnicode_KIND(unicode);
4003 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004004}
4005
4006int
4007PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4008{
4009 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004010 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 return -1;
4012 }
Victor Stinner488fa492011-12-12 00:01:39 +01004013 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004014 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004015 PyErr_SetString(PyExc_IndexError, "string index out of range");
4016 return -1;
4017 }
Victor Stinner488fa492011-12-12 00:01:39 +01004018 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004019 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004020 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4021 PyErr_SetString(PyExc_ValueError, "character out of range");
4022 return -1;
4023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4025 index, ch);
4026 return 0;
4027}
4028
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is static and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4034
Victor Stinner554f3f02010-06-16 23:33:54 +00004035/* create or adjust a UnicodeDecodeError */
4036static void
4037make_decode_exception(PyObject **exceptionObject,
4038 const char *encoding,
4039 const char *input, Py_ssize_t length,
4040 Py_ssize_t startpos, Py_ssize_t endpos,
4041 const char *reason)
4042{
4043 if (*exceptionObject == NULL) {
4044 *exceptionObject = PyUnicodeDecodeError_Create(
4045 encoding, input, length, startpos, endpos, reason);
4046 }
4047 else {
4048 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4049 goto onError;
4050 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4051 goto onError;
4052 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4053 goto onError;
4054 }
4055 return;
4056
4057onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004058 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004059}
4060
Steve Dowercc16be82016-09-08 10:35:16 -07004061#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004062static int
4063widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4064{
4065 if (newsize > *size) {
4066 wchar_t *newbuf = *buf;
4067 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4068 PyErr_NoMemory();
4069 return -1;
4070 }
4071 *buf = newbuf;
4072 }
4073 *size = newsize;
4074 return 0;
4075}
4076
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077/* error handling callback helper:
4078 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004079 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 and adjust various state variables.
4081 return 0 on success, -1 on error
4082*/
4083
Alexander Belopolsky40018472011-02-26 01:02:56 +00004084static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004085unicode_decode_call_errorhandler_wchar(
4086 const char *errors, PyObject **errorHandler,
4087 const char *encoding, const char *reason,
4088 const char **input, const char **inend, Py_ssize_t *startinpos,
4089 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004090 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004091{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004092 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093
4094 PyObject *restuple = NULL;
4095 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004096 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004097 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004098 Py_ssize_t requiredsize;
4099 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004100 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004101 wchar_t *repwstr;
4102 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103
4104 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 *errorHandler = PyCodec_LookupError(errors);
4106 if (*errorHandler == NULL)
4107 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 }
4109
Victor Stinner554f3f02010-06-16 23:33:54 +00004110 make_decode_exception(exceptionObject,
4111 encoding,
4112 *input, *inend - *input,
4113 *startinpos, *endinpos,
4114 reason);
4115 if (*exceptionObject == NULL)
4116 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004118 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004122 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004125 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004127
4128 /* Copy back the bytes variables, which might have been modified by the
4129 callback */
4130 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4131 if (!inputobj)
4132 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004133 *input = PyBytes_AS_STRING(inputobj);
4134 insize = PyBytes_GET_SIZE(inputobj);
4135 *inend = *input + insize;
4136 /* we can DECREF safely, as the exception has another reference,
4137 so the object won't go away. */
4138 Py_DECREF(inputobj);
4139
4140 if (newpos<0)
4141 newpos = insize+newpos;
4142 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004143 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004144 goto onError;
4145 }
4146
4147 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4148 if (repwstr == NULL)
4149 goto onError;
4150 /* need more space? (at least enough for what we
4151 have+the replacement+the rest of the string (starting
4152 at the new input position), so we won't have to check space
4153 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004154 requiredsize = *outpos;
4155 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4156 goto overflow;
4157 requiredsize += repwlen;
4158 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4159 goto overflow;
4160 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004161 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004162 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004163 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004164 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004165 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004166 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004167 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004168 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004169 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004170 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004171 *endinpos = newpos;
4172 *inptr = *input + newpos;
4173
4174 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004175 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004176 return 0;
4177
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004178 overflow:
4179 PyErr_SetString(PyExc_OverflowError,
4180 "decoded result is too long for a Python string");
4181
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004182 onError:
4183 Py_XDECREF(restuple);
4184 return -1;
4185}
Steve Dowercc16be82016-09-08 10:35:16 -07004186#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004187
4188static int
4189unicode_decode_call_errorhandler_writer(
4190 const char *errors, PyObject **errorHandler,
4191 const char *encoding, const char *reason,
4192 const char **input, const char **inend, Py_ssize_t *startinpos,
4193 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4194 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4195{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004196 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004197
4198 PyObject *restuple = NULL;
4199 PyObject *repunicode = NULL;
4200 Py_ssize_t insize;
4201 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004202 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004203 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004204 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004205 int need_to_grow = 0;
4206 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004207
4208 if (*errorHandler == NULL) {
4209 *errorHandler = PyCodec_LookupError(errors);
4210 if (*errorHandler == NULL)
4211 goto onError;
4212 }
4213
4214 make_decode_exception(exceptionObject,
4215 encoding,
4216 *input, *inend - *input,
4217 *startinpos, *endinpos,
4218 reason);
4219 if (*exceptionObject == NULL)
4220 goto onError;
4221
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004222 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004223 if (restuple == NULL)
4224 goto onError;
4225 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004226 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004227 goto onError;
4228 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004229 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004230 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004231
4232 /* Copy back the bytes variables, which might have been modified by the
4233 callback */
4234 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4235 if (!inputobj)
4236 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004237 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004238 *input = PyBytes_AS_STRING(inputobj);
4239 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004240 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004241 /* we can DECREF safely, as the exception has another reference,
4242 so the object won't go away. */
4243 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004247 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004248 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004250 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251
Victor Stinner170ca6f2013-04-18 00:25:28 +02004252 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004253 if (replen > 1) {
4254 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004255 need_to_grow = 1;
4256 }
4257 new_inptr = *input + newpos;
4258 if (*inend - new_inptr > remain) {
4259 /* We don't know the decoding algorithm here so we make the worst
4260 assumption that one byte decodes to one unicode character.
4261 If unfortunately one byte could decode to more unicode characters,
4262 the decoder may write out-of-bound then. Is it possible for the
4263 algorithms using this function? */
4264 writer->min_length += *inend - new_inptr - remain;
4265 need_to_grow = 1;
4266 }
4267 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004268 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004269 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004270 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4271 goto onError;
4272 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004274 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004275
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004277 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004278
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004280 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004281 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004285 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286}
4287
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288/* --- UTF-7 Codec -------------------------------------------------------- */
4289
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290/* See RFC2152 for details. We encode conservatively and decode liberally. */
4291
/* Helper macros for the base-64 alphabet used by UTF-7.  Each of them may
   evaluate its argument more than once, so only pass expressions without
   side effects. */

/* Is c one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))

/* The 6-bit value of base-64 character c.  The caller must already know
   that c is a base-64 character; anything unrecognised maps to 63. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* The base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n)  \
    (*("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + ((n) & 0x3f)))

/* DECODE_DIRECT: a byte met in a UTF-7 string outside a shift sequence
 * that decodes as itself.  Decoding is permissive: every ASCII byte is
 * accepted except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4322
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         character code; only 1..127 can ever be direct
 *   directO   non-zero if Set O characters are written as themselves
 *   directWS  non-zero if whitespace is written as itself
 *
 * Every parameter is parenthesised in the expansion so that compound
 * arguments such as `a || b` keep their meaning.  c is evaluated several
 * times; pass an expression without side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368
Alexander Belopolsky40018472011-02-26 01:02:56 +00004369PyObject *
4370PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004371 Py_ssize_t size,
4372 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004374 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4375}
4376
/* The stateful UTF-7 decoder.
 *
 * The only state we preserve between calls is our read position, i.e. how
 * many bytes of input we have consumed.  So if we end in the middle of a
 * shift sequence we have to back off the read position and the output to
 * the beginning of the sequence, otherwise we lose all the shift state
 * (seen bits, number of bits seen, high surrogate).
 *
 * Parameters:
 *   s, size   the input bytes and their count; size == 0 returns the
 *             empty string immediately.
 *   errors    error handler name, forwarded to
 *             unicode_decode_call_errorhandler_writer().
 *   consumed  if NULL, input that ends inside an inconsistent shift
 *             sequence is reported through the error handler
 *             ("unterminated shift sequence").  If non-NULL, no error is
 *             raised for that case; *consumed receives the number of
 *             bytes decoded, which is the offset of the opening '+' when
 *             the input ends inside a shift sequence.
 *
 * Returns the decoded string, or NULL on failure.
 */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* Offset of the '+' that opened the current shift sequence, or of the
       offending byte for an "unexpected special character" error. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    /* Writer position at the moment the current shift sequence began. */
    Py_ssize_t shiftOutStart;
    /* Number of not-yet-emitted bits held in base64buffer. */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* Pending high surrogate waiting for its low half, or 0. */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point: the end-of-input error handling below jumps
           here when the handler left unread input (s < e). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the lone
                               high surrogate as it is */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* a pending high surrogate is only emitted when the
                   terminating byte is one that decodes as itself */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                /* '+' followed by something that is neither '-' nor a
                   base-64 character */
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Common error exit of the loop body: report [startinpos, s) to
           the error handler, which may update starts, e and s. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Output was produced inside the unfinished shift
                   sequence and the writer holds non-ASCII data: build the
                   result from the first shiftOutStart characters only.
                   NOTE(review): presumably done so the result's character
                   width reflects only the retained prefix -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4580
4581
Alexander Belopolsky40018472011-02-26 01:02:56 +00004582PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004583_PyUnicode_EncodeUTF7(PyObject *str,
4584 int base64SetO,
4585 int base64WhiteSpace,
4586 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004587{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004588 int kind;
4589 void *data;
4590 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004591 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004593 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 unsigned int base64bits = 0;
4595 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004596 char * out;
4597 char * start;
4598
Benjamin Petersonbac79492012-01-14 13:34:47 -05004599 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004600 return NULL;
4601 kind = PyUnicode_KIND(str);
4602 data = PyUnicode_DATA(str);
4603 len = PyUnicode_GET_LENGTH(str);
4604
4605 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004607
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004608 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004609 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004610 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004611 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004612 if (v == NULL)
4613 return NULL;
4614
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004615 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004616 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004617 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 if (inShift) {
4620 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4621 /* shifting out */
4622 if (base64bits) { /* output remaining bits */
4623 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4624 base64buffer = 0;
4625 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 }
4627 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 /* Characters not in the BASE64 set implicitly unshift the sequence
4629 so no '-' is required, except if the character is itself a '-' */
4630 if (IS_BASE64(ch) || ch == '-') {
4631 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 *out++ = (char) ch;
4634 }
4635 else {
4636 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 else { /* not in a shift sequence */
4640 if (ch == '+') {
4641 *out++ = '+';
4642 *out++ = '-';
4643 }
4644 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4645 *out++ = (char) ch;
4646 }
4647 else {
4648 *out++ = '+';
4649 inShift = 1;
4650 goto encode_char;
4651 }
4652 }
4653 continue;
4654encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004656 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004657
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 /* code first surrogate */
4659 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004660 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 while (base64bits >= 6) {
4662 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4663 base64bits -= 6;
4664 }
4665 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004666 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668 base64bits += 16;
4669 base64buffer = (base64buffer << 16) | ch;
4670 while (base64bits >= 6) {
4671 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4672 base64bits -= 6;
4673 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004674 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004675 if (base64bits)
4676 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4677 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004678 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004679 if (_PyBytes_Resize(&v, out - start) < 0)
4680 return NULL;
4681 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004682}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004683PyObject *
4684PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4685 Py_ssize_t size,
4686 int base64SetO,
4687 int base64WhiteSpace,
4688 const char *errors)
4689{
4690 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004691 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004692 if (tmp == NULL)
4693 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004694 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004695 base64WhiteSpace, errors);
4696 Py_DECREF(tmp);
4697 return result;
4698}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004699
Antoine Pitrou244651a2009-05-04 18:56:13 +00004700#undef IS_BASE64
4701#undef FROM_BASE64
4702#undef TO_BASE64
4703#undef DECODE_DIRECT
4704#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706/* --- UTF-8 Codec -------------------------------------------------------- */
4707
Alexander Belopolsky40018472011-02-26 01:02:56 +00004708PyObject *
4709PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004710 Py_ssize_t size,
4711 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712{
Walter Dörwald69652032004-09-07 20:24:22 +00004713 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4714}
4715
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716#include "stringlib/asciilib.h"
4717#include "stringlib/codecs.h"
4718#include "stringlib/undef.h"
4719
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004720#include "stringlib/ucs1lib.h"
4721#include "stringlib/codecs.h"
4722#include "stringlib/undef.h"
4723
4724#include "stringlib/ucs2lib.h"
4725#include "stringlib/codecs.h"
4726#include "stringlib/undef.h"
4727
4728#include "stringlib/ucs4lib.h"
4729#include "stringlib/codecs.h"
4730#include "stringlib/undef.h"
4731
Antoine Pitrouab868312009-01-10 15:40:25 +00004732/* Mask to quickly check whether a C 'long' contains a
4733 non-ASCII, UTF8-encoded char. */
4734#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004735# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004736#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004737# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004738#else
4739# error C 'long' size should be either 4 or 8!
4740#endif
4741
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004742static Py_ssize_t
4743ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004744{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004746 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004747
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004748 /*
4749 * Issue #17237: m68k is a bit different from most architectures in
4750 * that objects do not use "natural alignment" - for example, int and
4751 * long are only aligned at 2-byte boundaries. Therefore the assert()
4752 * won't work; also, tests have shown that skipping the "optimised
4753 * version" will even speed up m68k.
4754 */
4755#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004757 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4758 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004759 /* Fast path, see in STRINGLIB(utf8_decode) for
4760 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004761 /* Help allocation */
4762 const char *_p = p;
4763 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764 while (_p < aligned_end) {
4765 unsigned long value = *(const unsigned long *) _p;
4766 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768 *((unsigned long *)q) = value;
4769 _p += SIZEOF_LONG;
4770 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004771 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772 p = _p;
4773 while (p < end) {
4774 if ((unsigned char)*p & 0x80)
4775 break;
4776 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004780#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004781#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 while (p < end) {
4783 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4784 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004785 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004786 /* Help allocation */
4787 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 while (_p < aligned_end) {
4789 unsigned long value = *(unsigned long *) _p;
4790 if (value & ASCII_CHAR_MASK)
4791 break;
4792 _p += SIZEOF_LONG;
4793 }
4794 p = _p;
4795 if (_p == end)
4796 break;
4797 }
4798 if ((unsigned char)*p & 0x80)
4799 break;
4800 ++p;
4801 }
4802 memcpy(dest, start, p - start);
4803 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804}
Antoine Pitrouab868312009-01-10 15:40:25 +00004805
Victor Stinner785938e2011-12-11 20:09:03 +01004806PyObject *
4807PyUnicode_DecodeUTF8Stateful(const char *s,
4808 Py_ssize_t size,
4809 const char *errors,
4810 Py_ssize_t *consumed)
4811{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004812 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004813 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004814 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004815
4816 Py_ssize_t startinpos;
4817 Py_ssize_t endinpos;
4818 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004819 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004821 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004822
4823 if (size == 0) {
4824 if (consumed)
4825 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004826 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004827 }
4828
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004829 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4830 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004831 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004832 *consumed = 1;
4833 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004834 }
4835
Victor Stinner8f674cc2013-04-17 23:02:17 +02004836 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004837 writer.min_length = size;
4838 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004839 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004840
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004841 writer.pos = ascii_decode(s, end, writer.data);
4842 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004843 while (s < end) {
4844 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004845 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004846
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004847 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004848 if (PyUnicode_IS_ASCII(writer.buffer))
4849 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004851 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004852 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004853 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004854 } else {
4855 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004856 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004857 }
4858
4859 switch (ch) {
4860 case 0:
4861 if (s == end || consumed)
4862 goto End;
4863 errmsg = "unexpected end of data";
4864 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004865 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004866 break;
4867 case 1:
4868 errmsg = "invalid start byte";
4869 startinpos = s - starts;
4870 endinpos = startinpos + 1;
4871 break;
4872 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004873 case 3:
4874 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004875 errmsg = "invalid continuation byte";
4876 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004877 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878 break;
4879 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004880 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004881 goto onError;
4882 continue;
4883 }
4884
Victor Stinner1d65d912015-10-05 13:43:50 +02004885 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004886 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004887
4888 switch (error_handler) {
4889 case _Py_ERROR_IGNORE:
4890 s += (endinpos - startinpos);
4891 break;
4892
4893 case _Py_ERROR_REPLACE:
4894 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4895 goto onError;
4896 s += (endinpos - startinpos);
4897 break;
4898
4899 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004900 {
4901 Py_ssize_t i;
4902
Victor Stinner1d65d912015-10-05 13:43:50 +02004903 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4904 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004905 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004906 ch = (Py_UCS4)(unsigned char)(starts[i]);
4907 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4908 ch + 0xdc00);
4909 writer.pos++;
4910 }
4911 s += (endinpos - startinpos);
4912 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004913 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004914
4915 default:
4916 if (unicode_decode_call_errorhandler_writer(
4917 errors, &error_handler_obj,
4918 "utf-8", errmsg,
4919 &starts, &end, &startinpos, &endinpos, &exc, &s,
4920 &writer))
4921 goto onError;
4922 }
Victor Stinner785938e2011-12-11 20:09:03 +01004923 }
4924
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 if (consumed)
4927 *consumed = s - starts;
4928
Victor Stinner1d65d912015-10-05 13:43:50 +02004929 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004931 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932
4933onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004934 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004936 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004938}
4939
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004940
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004941/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4942 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004943
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004944 On success, write a pointer to a newly allocated wide character string into
4945 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4946 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004947
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004948 On memory allocation failure, return -1.
4949
4950 On decoding error (if surrogateescape is zero), return -2. If wlen is
4951 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4952 is not NULL, write the decoding error message into *reason. */
4953int
4954_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004955 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004956{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004957 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004958 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959 wchar_t *unicode;
4960 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004961
Victor Stinner3d4226a2018-08-29 22:21:32 +02004962 int surrogateescape = 0;
4963 int surrogatepass = 0;
4964 switch (errors)
4965 {
4966 case _Py_ERROR_STRICT:
4967 break;
4968 case _Py_ERROR_SURROGATEESCAPE:
4969 surrogateescape = 1;
4970 break;
4971 case _Py_ERROR_SURROGATEPASS:
4972 surrogatepass = 1;
4973 break;
4974 default:
4975 return -3;
4976 }
4977
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004978 /* Note: size will always be longer than the resulting Unicode
4979 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004980 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004981 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004982 }
4983
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004984 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004985 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004986 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004987 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004988
4989 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004990 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004992 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 if (ch > 0xFF) {
5000#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005001 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005003 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005004 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5006 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5007#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005008 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005010 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005012 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005013
5014 if (surrogateescape) {
5015 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5016 }
5017 else {
5018 /* Is it a valid three-byte code? */
5019 if (surrogatepass
5020 && (e - s) >= 3
5021 && (s[0] & 0xf0) == 0xe0
5022 && (s[1] & 0xc0) == 0x80
5023 && (s[2] & 0xc0) == 0x80)
5024 {
5025 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5026 s += 3;
5027 unicode[outpos++] = ch;
5028 }
5029 else {
5030 PyMem_RawFree(unicode );
5031 if (reason != NULL) {
5032 switch (ch) {
5033 case 0:
5034 *reason = "unexpected end of data";
5035 break;
5036 case 1:
5037 *reason = "invalid start byte";
5038 break;
5039 /* 2, 3, 4 */
5040 default:
5041 *reason = "invalid continuation byte";
5042 break;
5043 }
5044 }
5045 if (wlen != NULL) {
5046 *wlen = s - orig_s;
5047 }
5048 return -2;
5049 }
5050 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005051 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005052 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005053 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005054 if (wlen) {
5055 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005056 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005057 *wstr = unicode;
5058 return 0;
5059}
5060
5061wchar_t*
5062_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5063{
5064 wchar_t *wstr;
5065 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5066 if (res != 0) {
5067 return NULL;
5068 }
5069 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005070}
5071
Antoine Pitrouab868312009-01-10 15:40:25 +00005072
Victor Stinnere47e6982017-12-21 15:45:16 +01005073/* UTF-8 encoder using the surrogateescape error handler .
5074
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005075 On success, return 0 and write the newly allocated character string (use
5076 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005077
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005078 On encoding failure, return -2 and write the position of the invalid
5079 surrogate character into *error_pos (if error_pos is set) and the decoding
5080 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005081
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005082 On memory allocation failure, return -1. */
5083int
5084_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005085 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005086{
5087 const Py_ssize_t max_char_size = 4;
5088 Py_ssize_t len = wcslen(text);
5089
5090 assert(len >= 0);
5091
Victor Stinner3d4226a2018-08-29 22:21:32 +02005092 int surrogateescape = 0;
5093 int surrogatepass = 0;
5094 switch (errors)
5095 {
5096 case _Py_ERROR_STRICT:
5097 break;
5098 case _Py_ERROR_SURROGATEESCAPE:
5099 surrogateescape = 1;
5100 break;
5101 case _Py_ERROR_SURROGATEPASS:
5102 surrogatepass = 1;
5103 break;
5104 default:
5105 return -3;
5106 }
5107
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005108 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5109 return -1;
5110 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005111 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005112 if (raw_malloc) {
5113 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005114 }
5115 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005116 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005117 }
5118 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005119 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005120 }
5121
5122 char *p = bytes;
5123 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005124 for (i = 0; i < len; ) {
5125 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005126 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005127 i++;
5128#if Py_UNICODE_SIZE == 2
5129 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5130 && i < len
5131 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5132 {
5133 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5134 i++;
5135 }
5136#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005137
5138 if (ch < 0x80) {
5139 /* Encode ASCII */
5140 *p++ = (char) ch;
5141
5142 }
5143 else if (ch < 0x0800) {
5144 /* Encode Latin-1 */
5145 *p++ = (char)(0xc0 | (ch >> 6));
5146 *p++ = (char)(0x80 | (ch & 0x3f));
5147 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005148 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005149 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005150 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005151 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005152 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005153 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005154 if (reason != NULL) {
5155 *reason = "encoding error";
5156 }
5157 if (raw_malloc) {
5158 PyMem_RawFree(bytes);
5159 }
5160 else {
5161 PyMem_Free(bytes);
5162 }
5163 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005164 }
5165 *p++ = (char)(ch & 0xff);
5166 }
5167 else if (ch < 0x10000) {
5168 *p++ = (char)(0xe0 | (ch >> 12));
5169 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5170 *p++ = (char)(0x80 | (ch & 0x3f));
5171 }
5172 else { /* ch >= 0x10000 */
5173 assert(ch <= MAX_UNICODE);
5174 /* Encode UCS4 Unicode ordinals */
5175 *p++ = (char)(0xf0 | (ch >> 18));
5176 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5177 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5178 *p++ = (char)(0x80 | (ch & 0x3f));
5179 }
5180 }
5181 *p++ = '\0';
5182
5183 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005184 char *bytes2;
5185 if (raw_malloc) {
5186 bytes2 = PyMem_RawRealloc(bytes, final_size);
5187 }
5188 else {
5189 bytes2 = PyMem_Realloc(bytes, final_size);
5190 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005191 if (bytes2 == NULL) {
5192 if (error_pos != NULL) {
5193 *error_pos = (size_t)-1;
5194 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005195 if (raw_malloc) {
5196 PyMem_RawFree(bytes);
5197 }
5198 else {
5199 PyMem_Free(bytes);
5200 }
5201 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005202 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005203 *str = bytes2;
5204 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005205}
5206
5207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005208/* Primary internal function which creates utf8 encoded bytes objects.
5209
5210 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005211 and allocate exactly as much space needed at the end. Else allocate the
5212 maximum possible needed (4 result bytes per Unicode character), and return
5213 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005214*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005215PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005216_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217{
Victor Stinner6099a032011-12-18 14:22:26 +01005218 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005219 void *data;
5220 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005222 if (!PyUnicode_Check(unicode)) {
5223 PyErr_BadArgument();
5224 return NULL;
5225 }
5226
5227 if (PyUnicode_READY(unicode) == -1)
5228 return NULL;
5229
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005230 if (PyUnicode_UTF8(unicode))
5231 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5232 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005233
5234 kind = PyUnicode_KIND(unicode);
5235 data = PyUnicode_DATA(unicode);
5236 size = PyUnicode_GET_LENGTH(unicode);
5237
Benjamin Petersonead6b532011-12-20 17:23:42 -06005238 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005239 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005240 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005241 case PyUnicode_1BYTE_KIND:
5242 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5243 assert(!PyUnicode_IS_ASCII(unicode));
5244 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5245 case PyUnicode_2BYTE_KIND:
5246 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5247 case PyUnicode_4BYTE_KIND:
5248 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250}
5251
Alexander Belopolsky40018472011-02-26 01:02:56 +00005252PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005253PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5254 Py_ssize_t size,
5255 const char *errors)
5256{
5257 PyObject *v, *unicode;
5258
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005259 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005260 if (unicode == NULL)
5261 return NULL;
5262 v = _PyUnicode_AsUTF8String(unicode, errors);
5263 Py_DECREF(unicode);
5264 return v;
5265}
5266
5267PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005268PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005270 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271}
5272
Walter Dörwald41980ca2007-08-16 21:55:45 +00005273/* --- UTF-32 Codec ------------------------------------------------------- */
5274
5275PyObject *
5276PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 Py_ssize_t size,
5278 const char *errors,
5279 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005280{
5281 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5282}
5283
5284PyObject *
5285PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 Py_ssize_t size,
5287 const char *errors,
5288 int *byteorder,
5289 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005290{
5291 const char *starts = s;
5292 Py_ssize_t startinpos;
5293 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005294 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005295 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005297 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005298 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005299 PyObject *errorHandler = NULL;
5300 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005301
Walter Dörwald41980ca2007-08-16 21:55:45 +00005302 q = (unsigned char *)s;
5303 e = q + size;
5304
5305 if (byteorder)
5306 bo = *byteorder;
5307
5308 /* Check for BOM marks (U+FEFF) in the input and adjust current
5309 byte order setting accordingly. In native mode, the leading BOM
5310 mark is skipped, in all other modes, it is copied to the output
5311 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005312 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005313 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005314 if (bom == 0x0000FEFF) {
5315 bo = -1;
5316 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005318 else if (bom == 0xFFFE0000) {
5319 bo = 1;
5320 q += 4;
5321 }
5322 if (byteorder)
5323 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005324 }
5325
Victor Stinnere64322e2012-10-30 23:12:47 +01005326 if (q == e) {
5327 if (consumed)
5328 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005329 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005330 }
5331
Victor Stinnere64322e2012-10-30 23:12:47 +01005332#ifdef WORDS_BIGENDIAN
5333 le = bo < 0;
5334#else
5335 le = bo <= 0;
5336#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005337 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005338
Victor Stinner8f674cc2013-04-17 23:02:17 +02005339 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005340 writer.min_length = (e - q + 3) / 4;
5341 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005342 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005343
Victor Stinnere64322e2012-10-30 23:12:47 +01005344 while (1) {
5345 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005346 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005347
Victor Stinnere64322e2012-10-30 23:12:47 +01005348 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005349 enum PyUnicode_Kind kind = writer.kind;
5350 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005351 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005352 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005353 if (le) {
5354 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005355 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005356 if (ch > maxch)
5357 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005358 if (kind != PyUnicode_1BYTE_KIND &&
5359 Py_UNICODE_IS_SURROGATE(ch))
5360 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005361 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005362 q += 4;
5363 } while (q <= last);
5364 }
5365 else {
5366 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005367 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005368 if (ch > maxch)
5369 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005370 if (kind != PyUnicode_1BYTE_KIND &&
5371 Py_UNICODE_IS_SURROGATE(ch))
5372 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005374 q += 4;
5375 } while (q <= last);
5376 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005377 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005378 }
5379
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005380 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005381 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005382 startinpos = ((const char *)q) - starts;
5383 endinpos = startinpos + 4;
5384 }
5385 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005386 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005388 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005390 startinpos = ((const char *)q) - starts;
5391 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005393 else {
5394 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005395 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005396 goto onError;
5397 q += 4;
5398 continue;
5399 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005400 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005401 startinpos = ((const char *)q) - starts;
5402 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005404
5405 /* The remaining input chars are ignored if the callback
5406 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005407 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005409 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005411 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005413 }
5414
Walter Dörwald41980ca2007-08-16 21:55:45 +00005415 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417
Walter Dörwald41980ca2007-08-16 21:55:45 +00005418 Py_XDECREF(errorHandler);
5419 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005420 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005423 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005424 Py_XDECREF(errorHandler);
5425 Py_XDECREF(exc);
5426 return NULL;
5427}
5428
5429PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005430_PyUnicode_EncodeUTF32(PyObject *str,
5431 const char *errors,
5432 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005433{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005434 enum PyUnicode_Kind kind;
5435 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005436 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005437 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005438 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005439#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005441#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005442 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005443#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005445 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 PyObject *errorHandler = NULL;
5447 PyObject *exc = NULL;
5448 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005449
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005450 if (!PyUnicode_Check(str)) {
5451 PyErr_BadArgument();
5452 return NULL;
5453 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005454 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005455 return NULL;
5456 kind = PyUnicode_KIND(str);
5457 data = PyUnicode_DATA(str);
5458 len = PyUnicode_GET_LENGTH(str);
5459
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005460 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005461 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005462 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005463 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005464 if (v == NULL)
5465 return NULL;
5466
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005467 /* output buffer is 4-bytes aligned */
5468 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005469 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005470 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005471 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005472 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005473 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005474
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005475 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005476 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005477 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005478 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 else
5480 encoding = "utf-32";
5481
5482 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005483 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5484 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005485 }
5486
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005487 pos = 0;
5488 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005490
5491 if (kind == PyUnicode_2BYTE_KIND) {
5492 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5493 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005495 else {
5496 assert(kind == PyUnicode_4BYTE_KIND);
5497 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5498 &out, native_ordering);
5499 }
5500 if (pos == len)
5501 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005502
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 rep = unicode_encode_call_errorhandler(
5504 errors, &errorHandler,
5505 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005506 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005507 if (!rep)
5508 goto error;
5509
5510 if (PyBytes_Check(rep)) {
5511 repsize = PyBytes_GET_SIZE(rep);
5512 if (repsize & 3) {
5513 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005514 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005515 "surrogates not allowed");
5516 goto error;
5517 }
5518 moreunits = repsize / 4;
5519 }
5520 else {
5521 assert(PyUnicode_Check(rep));
5522 if (PyUnicode_READY(rep) < 0)
5523 goto error;
5524 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5525 if (!PyUnicode_IS_ASCII(rep)) {
5526 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005527 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005528 "surrogates not allowed");
5529 goto error;
5530 }
5531 }
5532
5533 /* four bytes are reserved for each surrogate */
5534 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005535 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005536 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005537 /* integer overflow */
5538 PyErr_NoMemory();
5539 goto error;
5540 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005541 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005542 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005543 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005544 }
5545
5546 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005547 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005548 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005549 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005550 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005551 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5552 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005553 }
5554
5555 Py_CLEAR(rep);
5556 }
5557
5558 /* Cut back to size actually needed. This is necessary for, for example,
5559 encoding of a string containing isolated surrogates and the 'ignore'
5560 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005561 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005562 if (nsize != PyBytes_GET_SIZE(v))
5563 _PyBytes_Resize(&v, nsize);
5564 Py_XDECREF(errorHandler);
5565 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005566 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005567 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005568 error:
5569 Py_XDECREF(rep);
5570 Py_XDECREF(errorHandler);
5571 Py_XDECREF(exc);
5572 Py_XDECREF(v);
5573 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005574}
5575
Alexander Belopolsky40018472011-02-26 01:02:56 +00005576PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005577PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5578 Py_ssize_t size,
5579 const char *errors,
5580 int byteorder)
5581{
5582 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005583 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005584 if (tmp == NULL)
5585 return NULL;
5586 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5587 Py_DECREF(tmp);
5588 return result;
5589}
5590
5591PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005592PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005593{
Victor Stinnerb960b342011-11-20 19:12:52 +01005594 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005595}
5596
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597/* --- UTF-16 Codec ------------------------------------------------------- */
5598
Tim Peters772747b2001-08-09 22:21:55 +00005599PyObject *
5600PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 Py_ssize_t size,
5602 const char *errors,
5603 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604{
Walter Dörwald69652032004-09-07 20:24:22 +00005605 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5606}
5607
5608PyObject *
5609PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 Py_ssize_t size,
5611 const char *errors,
5612 int *byteorder,
5613 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005614{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005615 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005616 Py_ssize_t startinpos;
5617 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005619 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005620 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005621 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005622 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 PyObject *errorHandler = NULL;
5624 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005625 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
Tim Peters772747b2001-08-09 22:21:55 +00005627 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629
5630 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005631 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005633 /* Check for BOM marks (U+FEFF) in the input and adjust current
5634 byte order setting accordingly. In native mode, the leading BOM
5635 mark is skipped, in all other modes, it is copied to the output
5636 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005637 if (bo == 0 && size >= 2) {
5638 const Py_UCS4 bom = (q[1] << 8) | q[0];
5639 if (bom == 0xFEFF) {
5640 q += 2;
5641 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 else if (bom == 0xFFFE) {
5644 q += 2;
5645 bo = 1;
5646 }
5647 if (byteorder)
5648 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
Antoine Pitrou63065d72012-05-15 23:48:04 +02005651 if (q == e) {
5652 if (consumed)
5653 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005654 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005655 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005656
Christian Heimes743e0cd2012-10-17 23:52:17 +02005657#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005658 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005659 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005660#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005661 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005662 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005663#endif
Tim Peters772747b2001-08-09 22:21:55 +00005664
Antoine Pitrou63065d72012-05-15 23:48:04 +02005665 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005666 character count normally. Error handler will take care of
5667 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005668 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005669 writer.min_length = (e - q + 1) / 2;
5670 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005671 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005672
Antoine Pitrou63065d72012-05-15 23:48:04 +02005673 while (1) {
5674 Py_UCS4 ch = 0;
5675 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005676 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005677 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005679 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005680 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005681 native_ordering);
5682 else
5683 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005684 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005685 native_ordering);
5686 } else if (kind == PyUnicode_2BYTE_KIND) {
5687 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005689 native_ordering);
5690 } else {
5691 assert(kind == PyUnicode_4BYTE_KIND);
5692 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005693 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005694 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005695 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005696 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697
Antoine Pitrou63065d72012-05-15 23:48:04 +02005698 switch (ch)
5699 {
5700 case 0:
5701 /* remaining byte at the end? (size should be even) */
5702 if (q == e || consumed)
5703 goto End;
5704 errmsg = "truncated data";
5705 startinpos = ((const char *)q) - starts;
5706 endinpos = ((const char *)e) - starts;
5707 break;
5708 /* The remaining input chars are ignored if the callback
5709 chooses to skip the input */
5710 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005711 q -= 2;
5712 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005713 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005714 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005715 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005716 endinpos = ((const char *)e) - starts;
5717 break;
5718 case 2:
5719 errmsg = "illegal encoding";
5720 startinpos = ((const char *)q) - 2 - starts;
5721 endinpos = startinpos + 2;
5722 break;
5723 case 3:
5724 errmsg = "illegal UTF-16 surrogate";
5725 startinpos = ((const char *)q) - 4 - starts;
5726 endinpos = startinpos + 2;
5727 break;
5728 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005729 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005730 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 continue;
5732 }
5733
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005734 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005735 errors,
5736 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005737 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005738 &starts,
5739 (const char **)&e,
5740 &startinpos,
5741 &endinpos,
5742 &exc,
5743 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005744 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 }
5747
Antoine Pitrou63065d72012-05-15 23:48:04 +02005748End:
Walter Dörwald69652032004-09-07 20:24:22 +00005749 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 Py_XDECREF(errorHandler);
5753 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005754 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005757 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 Py_XDECREF(errorHandler);
5759 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 return NULL;
5761}
5762
Tim Peters772747b2001-08-09 22:21:55 +00005763PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005764_PyUnicode_EncodeUTF16(PyObject *str,
5765 const char *errors,
5766 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005768 enum PyUnicode_Kind kind;
5769 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005770 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005771 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005772 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005773 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005774#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005775 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005776#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005777 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005778#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 const char *encoding;
5780 Py_ssize_t nsize, pos;
5781 PyObject *errorHandler = NULL;
5782 PyObject *exc = NULL;
5783 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005784
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005785 if (!PyUnicode_Check(str)) {
5786 PyErr_BadArgument();
5787 return NULL;
5788 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005789 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005790 return NULL;
5791 kind = PyUnicode_KIND(str);
5792 data = PyUnicode_DATA(str);
5793 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005794
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005795 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005796 if (kind == PyUnicode_4BYTE_KIND) {
5797 const Py_UCS4 *in = (const Py_UCS4 *)data;
5798 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005799 while (in < end) {
5800 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005801 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005802 }
5803 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005804 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005805 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005807 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005808 nsize = len + pairs + (byteorder == 0);
5809 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005810 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005814 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005815 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005816 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005817 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005818 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005819 }
5820 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005821 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005822 }
Tim Peters772747b2001-08-09 22:21:55 +00005823
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005824 if (kind == PyUnicode_1BYTE_KIND) {
5825 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5826 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005827 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005828
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005829 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005830 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005831 }
5832 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005833 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005834 }
5835 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005837 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005838
5839 pos = 0;
5840 while (pos < len) {
5841 Py_ssize_t repsize, moreunits;
5842
5843 if (kind == PyUnicode_2BYTE_KIND) {
5844 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5845 &out, native_ordering);
5846 }
5847 else {
5848 assert(kind == PyUnicode_4BYTE_KIND);
5849 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5850 &out, native_ordering);
5851 }
5852 if (pos == len)
5853 break;
5854
5855 rep = unicode_encode_call_errorhandler(
5856 errors, &errorHandler,
5857 encoding, "surrogates not allowed",
5858 str, &exc, pos, pos + 1, &pos);
5859 if (!rep)
5860 goto error;
5861
5862 if (PyBytes_Check(rep)) {
5863 repsize = PyBytes_GET_SIZE(rep);
5864 if (repsize & 1) {
5865 raise_encode_exception(&exc, encoding,
5866 str, pos - 1, pos,
5867 "surrogates not allowed");
5868 goto error;
5869 }
5870 moreunits = repsize / 2;
5871 }
5872 else {
5873 assert(PyUnicode_Check(rep));
5874 if (PyUnicode_READY(rep) < 0)
5875 goto error;
5876 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5877 if (!PyUnicode_IS_ASCII(rep)) {
5878 raise_encode_exception(&exc, encoding,
5879 str, pos - 1, pos,
5880 "surrogates not allowed");
5881 goto error;
5882 }
5883 }
5884
5885 /* two bytes are reserved for each surrogate */
5886 if (moreunits > 1) {
5887 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005888 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005889 /* integer overflow */
5890 PyErr_NoMemory();
5891 goto error;
5892 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005893 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005894 goto error;
5895 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5896 }
5897
5898 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005899 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005900 out += moreunits;
5901 } else /* rep is unicode */ {
5902 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5903 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5904 &out, native_ordering);
5905 }
5906
5907 Py_CLEAR(rep);
5908 }
5909
5910 /* Cut back to size actually needed. This is necessary for, for example,
5911 encoding of a string containing isolated surrogates and the 'ignore' handler
5912 is used. */
5913 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5914 if (nsize != PyBytes_GET_SIZE(v))
5915 _PyBytes_Resize(&v, nsize);
5916 Py_XDECREF(errorHandler);
5917 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005918 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005919 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005920 error:
5921 Py_XDECREF(rep);
5922 Py_XDECREF(errorHandler);
5923 Py_XDECREF(exc);
5924 Py_XDECREF(v);
5925 return NULL;
5926#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927}
5928
Alexander Belopolsky40018472011-02-26 01:02:56 +00005929PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005930PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5931 Py_ssize_t size,
5932 const char *errors,
5933 int byteorder)
5934{
5935 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005936 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005937 if (tmp == NULL)
5938 return NULL;
5939 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5940 Py_DECREF(tmp);
5941 return result;
5942}
5943
5944PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005945PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005947 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948}
5949
5950/* --- Unicode Escape Codec ----------------------------------------------- */
5951
Fredrik Lundh06d12682001-01-24 07:59:11 +00005952static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005953
Alexander Belopolsky40018472011-02-26 01:02:56 +00005954PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005955_PyUnicode_DecodeUnicodeEscape(const char *s,
5956 Py_ssize_t size,
5957 const char *errors,
5958 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005961 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963 PyObject *errorHandler = NULL;
5964 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005965
Eric V. Smith42454af2016-10-31 09:22:08 -04005966 // so we can remember if we've seen an invalid escape char or not
5967 *first_invalid_escape = NULL;
5968
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005970 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005971 }
5972 /* Escaped strings will always be longer than the resulting
5973 Unicode string, so we start with size here and then reduce the
5974 length after conversion to the true value.
5975 (but if the error callback returns a long replacement string
5976 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005977 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005978 writer.min_length = size;
5979 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5980 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005981 }
5982
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 end = s + size;
5984 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005985 unsigned char c = (unsigned char) *s++;
5986 Py_UCS4 ch;
5987 int count;
5988 Py_ssize_t startinpos;
5989 Py_ssize_t endinpos;
5990 const char *message;
5991
5992#define WRITE_ASCII_CHAR(ch) \
5993 do { \
5994 assert(ch <= 127); \
5995 assert(writer.pos < writer.size); \
5996 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5997 } while(0)
5998
5999#define WRITE_CHAR(ch) \
6000 do { \
6001 if (ch <= writer.maxchar) { \
6002 assert(writer.pos < writer.size); \
6003 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6004 } \
6005 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6006 goto onError; \
6007 } \
6008 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
6010 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 if (c != '\\') {
6012 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 continue;
6014 }
6015
Victor Stinner62ec3312016-09-06 17:04:34 -07006016 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006018 if (s >= end) {
6019 message = "\\ at end of string";
6020 goto error;
6021 }
6022 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006023
Victor Stinner62ec3312016-09-06 17:04:34 -07006024 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006025 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006028 case '\n': continue;
6029 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6030 case '\'': WRITE_ASCII_CHAR('\''); continue;
6031 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6032 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006033 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006034 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6035 case 't': WRITE_ASCII_CHAR('\t'); continue;
6036 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6037 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006038 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006039 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006040 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006041 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 case '0': case '1': case '2': case '3':
6045 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006046 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006047 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006048 ch = (ch<<3) + *s++ - '0';
6049 if (s < end && '0' <= *s && *s <= '7') {
6050 ch = (ch<<3) + *s++ - '0';
6051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006053 WRITE_CHAR(ch);
6054 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 /* hex escapes */
6057 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060 message = "truncated \\xXX escape";
6061 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066 message = "truncated \\uXXXX escape";
6067 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006070 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006071 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 message = "truncated \\UXXXXXXXX escape";
6073 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006074 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006075 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006076 ch <<= 4;
6077 if (c >= '0' && c <= '9') {
6078 ch += c - '0';
6079 }
6080 else if (c >= 'a' && c <= 'f') {
6081 ch += c - ('a' - 10);
6082 }
6083 else if (c >= 'A' && c <= 'F') {
6084 ch += c - ('A' - 10);
6085 }
6086 else {
6087 break;
6088 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006089 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006090 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006091 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006092 }
6093
6094 /* when we get here, ch is a 32-bit unicode character */
6095 if (ch > MAX_UNICODE) {
6096 message = "illegal Unicode character";
6097 goto error;
6098 }
6099
6100 WRITE_CHAR(ch);
6101 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006102
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006104 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006105 if (ucnhash_CAPI == NULL) {
6106 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006107 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6108 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006109 if (ucnhash_CAPI == NULL) {
6110 PyErr_SetString(
6111 PyExc_UnicodeError,
6112 "\\N escapes not supported (can't load unicodedata module)"
6113 );
6114 goto onError;
6115 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006116 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006117
6118 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006119 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006120 const char *start = ++s;
6121 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006122 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006123 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006124 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006125 namelen = s - start;
6126 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006127 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006128 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006129 ch = 0xffffffff; /* in case 'getcode' messes up */
6130 if (namelen <= INT_MAX &&
6131 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6132 &ch, 0)) {
6133 assert(ch <= MAX_UNICODE);
6134 WRITE_CHAR(ch);
6135 continue;
6136 }
6137 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006138 }
6139 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006140 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006141
6142 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006143 if (*first_invalid_escape == NULL) {
6144 *first_invalid_escape = s-1; /* Back up one char, since we've
6145 already incremented s. */
6146 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 WRITE_ASCII_CHAR('\\');
6148 WRITE_CHAR(c);
6149 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006151
6152 error:
6153 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006154 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006155 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006156 errors, &errorHandler,
6157 "unicodeescape", message,
6158 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006159 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006160 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006162 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006163
6164#undef WRITE_ASCII_CHAR
6165#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006167
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006168 Py_XDECREF(errorHandler);
6169 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006170 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006171
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006173 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174 Py_XDECREF(errorHandler);
6175 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 return NULL;
6177}
6178
Eric V. Smith42454af2016-10-31 09:22:08 -04006179PyObject *
6180PyUnicode_DecodeUnicodeEscape(const char *s,
6181 Py_ssize_t size,
6182 const char *errors)
6183{
6184 const char *first_invalid_escape;
6185 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6186 &first_invalid_escape);
6187 if (result == NULL)
6188 return NULL;
6189 if (first_invalid_escape != NULL) {
6190 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6191 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006192 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006193 Py_DECREF(result);
6194 return NULL;
6195 }
6196 }
6197 return result;
6198}
6199
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006200/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201
Alexander Belopolsky40018472011-02-26 01:02:56 +00006202PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006203PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006206 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006208 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006209 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
Ezio Melottie7f90372012-10-05 03:33:31 +03006212 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006213 escape.
6214
Ezio Melottie7f90372012-10-05 03:33:31 +03006215 For UCS1 strings it's '\xxx', 4 bytes per source character.
6216 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6217 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006218 */
6219
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006220 if (!PyUnicode_Check(unicode)) {
6221 PyErr_BadArgument();
6222 return NULL;
6223 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006225 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 }
Victor Stinner358af132015-10-12 22:36:57 +02006227
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006228 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006229 if (len == 0) {
6230 return PyBytes_FromStringAndSize(NULL, 0);
6231 }
6232
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006233 kind = PyUnicode_KIND(unicode);
6234 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6236 bytes, and 1 byte characters 4. */
6237 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006238 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006239 return PyErr_NoMemory();
6240 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006241 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 if (repr == NULL) {
6243 return NULL;
6244 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006245
Victor Stinner62ec3312016-09-06 17:04:34 -07006246 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006247 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006248 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006249
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 /* U+0000-U+00ff range */
6251 if (ch < 0x100) {
6252 if (ch >= ' ' && ch < 127) {
6253 if (ch != '\\') {
6254 /* Copy printable US ASCII as-is */
6255 *p++ = (char) ch;
6256 }
6257 /* Escape backslashes */
6258 else {
6259 *p++ = '\\';
6260 *p++ = '\\';
6261 }
6262 }
Victor Stinner358af132015-10-12 22:36:57 +02006263
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 /* Map special whitespace to '\t', \n', '\r' */
6265 else if (ch == '\t') {
6266 *p++ = '\\';
6267 *p++ = 't';
6268 }
6269 else if (ch == '\n') {
6270 *p++ = '\\';
6271 *p++ = 'n';
6272 }
6273 else if (ch == '\r') {
6274 *p++ = '\\';
6275 *p++ = 'r';
6276 }
6277
6278 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6279 else {
6280 *p++ = '\\';
6281 *p++ = 'x';
6282 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6283 *p++ = Py_hexdigits[ch & 0x000F];
6284 }
Tim Petersced69f82003-09-16 20:30:58 +00006285 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006286 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006287 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 *p++ = '\\';
6289 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006290 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6291 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6292 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6293 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6296 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006297
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 /* Make sure that the first two digits are zero */
6299 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006300 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 *p++ = 'U';
6302 *p++ = '0';
6303 *p++ = '0';
6304 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6305 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6306 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6307 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6308 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6309 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006310 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 assert(p - PyBytes_AS_STRING(repr) > 0);
6314 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6315 return NULL;
6316 }
6317 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318}
6319
Alexander Belopolsky40018472011-02-26 01:02:56 +00006320PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006321PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6322 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006324 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006325 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006326 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006328 }
6329
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330 result = PyUnicode_AsUnicodeEscapeString(tmp);
6331 Py_DECREF(tmp);
6332 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333}
6334
6335/* --- Raw Unicode Escape Codec ------------------------------------------- */
6336
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337PyObject *
6338PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006339 Py_ssize_t size,
6340 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 PyObject *errorHandler = NULL;
6346 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006347
Victor Stinner62ec3312016-09-06 17:04:34 -07006348 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006349 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006351
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 /* Escaped strings will always be longer than the resulting
6353 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354 length after conversion to the true value. (But decoding error
6355 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006356 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006357 writer.min_length = size;
6358 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6359 goto onError;
6360 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006361
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 end = s + size;
6363 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006364 unsigned char c = (unsigned char) *s++;
6365 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006366 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006367 Py_ssize_t startinpos;
6368 Py_ssize_t endinpos;
6369 const char *message;
6370
6371#define WRITE_CHAR(ch) \
6372 do { \
6373 if (ch <= writer.maxchar) { \
6374 assert(writer.pos < writer.size); \
6375 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6376 } \
6377 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6378 goto onError; \
6379 } \
6380 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 if (c != '\\' || s >= end) {
6384 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006386 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006387
Victor Stinner62ec3312016-09-06 17:04:34 -07006388 c = (unsigned char) *s++;
6389 if (c == 'u') {
6390 count = 4;
6391 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006393 else if (c == 'U') {
6394 count = 8;
6395 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006396 }
6397 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006398 assert(writer.pos < writer.size);
6399 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6400 WRITE_CHAR(c);
6401 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006402 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006403 startinpos = s - starts - 2;
6404
6405 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6406 for (ch = 0; count && s < end; ++s, --count) {
6407 c = (unsigned char)*s;
6408 ch <<= 4;
6409 if (c >= '0' && c <= '9') {
6410 ch += c - '0';
6411 }
6412 else if (c >= 'a' && c <= 'f') {
6413 ch += c - ('a' - 10);
6414 }
6415 else if (c >= 'A' && c <= 'F') {
6416 ch += c - ('A' - 10);
6417 }
6418 else {
6419 break;
6420 }
6421 }
6422 if (!count) {
6423 if (ch <= MAX_UNICODE) {
6424 WRITE_CHAR(ch);
6425 continue;
6426 }
6427 message = "\\Uxxxxxxxx out of range";
6428 }
6429
6430 endinpos = s-starts;
6431 writer.min_length = end - s + writer.pos;
6432 if (unicode_decode_call_errorhandler_writer(
6433 errors, &errorHandler,
6434 "rawunicodeescape", message,
6435 &starts, &end, &startinpos, &endinpos, &exc, &s,
6436 &writer)) {
6437 goto onError;
6438 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006439 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006440
6441#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 Py_XDECREF(errorHandler);
6444 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006445 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006446
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006448 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 Py_XDECREF(errorHandler);
6450 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453}
6454
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006455
Alexander Belopolsky40018472011-02-26 01:02:56 +00006456PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006457PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458{
Victor Stinner62ec3312016-09-06 17:04:34 -07006459 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006462 int kind;
6463 void *data;
6464 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006466 if (!PyUnicode_Check(unicode)) {
6467 PyErr_BadArgument();
6468 return NULL;
6469 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006470 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006471 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006473 kind = PyUnicode_KIND(unicode);
6474 data = PyUnicode_DATA(unicode);
6475 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 if (kind == PyUnicode_1BYTE_KIND) {
6477 return PyBytes_FromStringAndSize(data, len);
6478 }
Victor Stinner0e368262011-11-10 20:12:49 +01006479
Victor Stinner62ec3312016-09-06 17:04:34 -07006480 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6481 bytes, and 1 byte characters 4. */
6482 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006483
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 if (len > PY_SSIZE_T_MAX / expandsize) {
6485 return PyErr_NoMemory();
6486 }
6487 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6488 if (repr == NULL) {
6489 return NULL;
6490 }
6491 if (len == 0) {
6492 return repr;
6493 }
6494
6495 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496 for (pos = 0; pos < len; pos++) {
6497 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006498
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6500 if (ch < 0x100) {
6501 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006502 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006503 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006504 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 *p++ = '\\';
6506 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006507 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6508 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6509 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6510 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006512 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6513 else {
6514 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6515 *p++ = '\\';
6516 *p++ = 'U';
6517 *p++ = '0';
6518 *p++ = '0';
6519 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6520 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6521 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6522 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6523 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6524 *p++ = Py_hexdigits[ch & 15];
6525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006527
Victor Stinner62ec3312016-09-06 17:04:34 -07006528 assert(p > PyBytes_AS_STRING(repr));
6529 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6530 return NULL;
6531 }
6532 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533}
6534
Alexander Belopolsky40018472011-02-26 01:02:56 +00006535PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006536PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6537 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006539 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006540 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006541 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006542 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006543 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6544 Py_DECREF(tmp);
6545 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546}
6547
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006548/* --- Unicode Internal Codec ------------------------------------------- */
6549
Alexander Belopolsky40018472011-02-26 01:02:56 +00006550PyObject *
6551_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006552 Py_ssize_t size,
6553 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006554{
6555 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006556 Py_ssize_t startinpos;
6557 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006558 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006559 const char *end;
6560 const char *reason;
6561 PyObject *errorHandler = NULL;
6562 PyObject *exc = NULL;
6563
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006564 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006565 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006566 1))
6567 return NULL;
6568
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006569 if (size < 0) {
6570 PyErr_BadInternalCall();
6571 return NULL;
6572 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006573 if (size == 0)
6574 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006575
Victor Stinner8f674cc2013-04-17 23:02:17 +02006576 _PyUnicodeWriter_Init(&writer);
6577 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6578 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006580 }
6581 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006582
Victor Stinner8f674cc2013-04-17 23:02:17 +02006583 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006584 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006585 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006586 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006587 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006588 endinpos = end-starts;
6589 reason = "truncated input";
6590 goto error;
6591 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006592 /* We copy the raw representation one byte at a time because the
6593 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006594 ((char *) &uch)[0] = s[0];
6595 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006596#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006597 ((char *) &uch)[2] = s[2];
6598 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006599#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006600 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006601#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006602 /* We have to sanity check the raw data, otherwise doom looms for
6603 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006604 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006605 endinpos = s - starts + Py_UNICODE_SIZE;
6606 reason = "illegal code point (> 0x10FFFF)";
6607 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006608 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006609#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006610 s += Py_UNICODE_SIZE;
6611#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006612 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006613 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006614 Py_UNICODE uch2;
6615 ((char *) &uch2)[0] = s[0];
6616 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006617 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006618 {
Victor Stinner551ac952011-11-29 22:58:13 +01006619 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006620 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006621 }
6622 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006623#endif
6624
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006625 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006626 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006627 continue;
6628
6629 error:
6630 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006631 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006632 errors, &errorHandler,
6633 "unicode_internal", reason,
6634 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006635 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006636 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006637 }
6638
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006639 Py_XDECREF(errorHandler);
6640 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006641 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006642
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006644 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006645 Py_XDECREF(errorHandler);
6646 Py_XDECREF(exc);
6647 return NULL;
6648}
6649
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650/* --- Latin-1 Codec ------------------------------------------------------ */
6651
Alexander Belopolsky40018472011-02-26 01:02:56 +00006652PyObject *
6653PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006654 Py_ssize_t size,
6655 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006658 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659}
6660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662static void
6663make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006664 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006665 PyObject *unicode,
6666 Py_ssize_t startpos, Py_ssize_t endpos,
6667 const char *reason)
6668{
6669 if (*exceptionObject == NULL) {
6670 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006671 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006672 encoding, unicode, startpos, endpos, reason);
6673 }
6674 else {
6675 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6676 goto onError;
6677 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6678 goto onError;
6679 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6680 goto onError;
6681 return;
6682 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006683 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006684 }
6685}
6686
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006688static void
6689raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006690 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006691 PyObject *unicode,
6692 Py_ssize_t startpos, Py_ssize_t endpos,
6693 const char *reason)
6694{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006695 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006696 encoding, unicode, startpos, endpos, reason);
6697 if (*exceptionObject != NULL)
6698 PyCodec_StrictErrors(*exceptionObject);
6699}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700
6701/* error handling callback helper:
6702 build arguments, call the callback and check the arguments,
6703 put the result into newpos and return the replacement string, which
6704 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006705static PyObject *
6706unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006707 PyObject **errorHandler,
6708 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006710 Py_ssize_t startpos, Py_ssize_t endpos,
6711 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006713 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006714 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715 PyObject *restuple;
6716 PyObject *resunicode;
6717
6718 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006720 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 }
6723
Benjamin Petersonbac79492012-01-14 13:34:47 -05006724 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006725 return NULL;
6726 len = PyUnicode_GET_LENGTH(unicode);
6727
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006728 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006729 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006733 restuple = PyObject_CallFunctionObjArgs(
6734 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006738 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 Py_DECREF(restuple);
6740 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006742 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 &resunicode, newpos)) {
6744 Py_DECREF(restuple);
6745 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006747 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6748 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6749 Py_DECREF(restuple);
6750 return NULL;
6751 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753 *newpos = len + *newpos;
6754 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006755 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 Py_DECREF(restuple);
6757 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006758 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006759 Py_INCREF(resunicode);
6760 Py_DECREF(restuple);
6761 return resunicode;
6762}
6763
Alexander Belopolsky40018472011-02-26 01:02:56 +00006764static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006765unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006766 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006767 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006768{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006769 /* input state */
6770 Py_ssize_t pos=0, size;
6771 int kind;
6772 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006773 /* pointer into the output */
6774 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006775 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6776 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006777 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006778 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006779 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006780 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006781 /* output object */
6782 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006783
Benjamin Petersonbac79492012-01-14 13:34:47 -05006784 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 return NULL;
6786 size = PyUnicode_GET_LENGTH(unicode);
6787 kind = PyUnicode_KIND(unicode);
6788 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006789 /* allocate enough for a simple encoding without
6790 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006791 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006792 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006793
6794 _PyBytesWriter_Init(&writer);
6795 str = _PyBytesWriter_Alloc(&writer, size);
6796 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006797 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006798
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006799 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006800 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006803 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006805 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006806 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006807 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006809 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006811 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006812 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006814
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006815 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006817
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006818 /* Only overallocate the buffer if it's not the last write */
6819 writer.overallocate = (collend < size);
6820
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006822 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006823 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006824
6825 switch (error_handler) {
6826 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006827 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006829
6830 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006831 memset(str, '?', collend - collstart);
6832 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006833 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006834 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006835 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 break;
Victor Stinner50149202015-09-22 00:26:54 +02006837
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006838 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006839 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006840 writer.min_size -= (collend - collstart);
6841 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006842 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006843 if (str == NULL)
6844 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006845 pos = collend;
6846 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006847
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006848 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006849 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006850 writer.min_size -= (collend - collstart);
6851 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006852 unicode, collstart, collend);
6853 if (str == NULL)
6854 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006855 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 break;
Victor Stinner50149202015-09-22 00:26:54 +02006857
Victor Stinnerc3713e92015-09-29 12:32:13 +02006858 case _Py_ERROR_SURROGATEESCAPE:
6859 for (i = collstart; i < collend; ++i) {
6860 ch = PyUnicode_READ(kind, data, i);
6861 if (ch < 0xdc80 || 0xdcff < ch) {
6862 /* Not a UTF-8b surrogate */
6863 break;
6864 }
6865 *str++ = (char)(ch - 0xdc00);
6866 ++pos;
6867 }
6868 if (i >= collend)
6869 break;
6870 collstart = pos;
6871 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006872 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006873
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006875 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6876 encoding, reason, unicode, &exc,
6877 collstart, collend, &newpos);
6878 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006880
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006881 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006882 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006883
Victor Stinner6bd525b2015-10-09 13:10:05 +02006884 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006885 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006886 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006887 PyBytes_AS_STRING(rep),
6888 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006889 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006890 else {
6891 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006892
Victor Stinner6bd525b2015-10-09 13:10:05 +02006893 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006895
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006896 if (limit == 256 ?
6897 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6898 !PyUnicode_IS_ASCII(rep))
6899 {
6900 /* Not all characters are smaller than limit */
6901 raise_encode_exception(&exc, encoding, unicode,
6902 collstart, collend, reason);
6903 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006905 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6906 str = _PyBytesWriter_WriteBytes(&writer, str,
6907 PyUnicode_DATA(rep),
6908 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006910 if (str == NULL)
6911 goto onError;
6912
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006913 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006914 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006915 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006916
6917 /* If overallocation was disabled, ensure that it was the last
6918 write. Otherwise, we missed an optimization */
6919 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006920 }
6921 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006922
Victor Stinner50149202015-09-22 00:26:54 +02006923 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006924 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006925 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006926
6927 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006928 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006929 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006930 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006931 Py_XDECREF(exc);
6932 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006933}
6934
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006935/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006936PyObject *
6937PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006938 Py_ssize_t size,
6939 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006941 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006942 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006943 if (unicode == NULL)
6944 return NULL;
6945 result = unicode_encode_ucs1(unicode, errors, 256);
6946 Py_DECREF(unicode);
6947 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948}
6949
Alexander Belopolsky40018472011-02-26 01:02:56 +00006950PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006951_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952{
6953 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 PyErr_BadArgument();
6955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006957 if (PyUnicode_READY(unicode) == -1)
6958 return NULL;
6959 /* Fast path: if it is a one-byte string, construct
6960 bytes object directly. */
6961 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6962 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6963 PyUnicode_GET_LENGTH(unicode));
6964 /* Non-Latin-1 characters present. Defer to above function to
6965 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006966 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006967}
6968
6969PyObject*
6970PyUnicode_AsLatin1String(PyObject *unicode)
6971{
6972 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973}
6974
6975/* --- 7-bit ASCII Codec -------------------------------------------------- */
6976
Alexander Belopolsky40018472011-02-26 01:02:56 +00006977PyObject *
6978PyUnicode_DecodeASCII(const char *s,
6979 Py_ssize_t size,
6980 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006982 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006983 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006984 int kind;
6985 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006986 Py_ssize_t startinpos;
6987 Py_ssize_t endinpos;
6988 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006989 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006990 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006991 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006992 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006993
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006995 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006996
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006998 if (size == 1 && (unsigned char)s[0] < 128)
6999 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007000
Victor Stinner8f674cc2013-04-17 23:02:17 +02007001 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007002 writer.min_length = size;
7003 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02007004 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007008 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007009 writer.pos = outpos;
7010 if (writer.pos == size)
7011 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007012
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 s += writer.pos;
7014 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007015 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007016 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007018 PyUnicode_WRITE(kind, data, writer.pos, c);
7019 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007021 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007023
7024 /* byte outsize range 0x00..0x7f: call the error handler */
7025
7026 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007027 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007028
7029 switch (error_handler)
7030 {
7031 case _Py_ERROR_REPLACE:
7032 case _Py_ERROR_SURROGATEESCAPE:
7033 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007034 but we may switch to UCS2 at the first write */
7035 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7036 goto onError;
7037 kind = writer.kind;
7038 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007039
7040 if (error_handler == _Py_ERROR_REPLACE)
7041 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7042 else
7043 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7044 writer.pos++;
7045 ++s;
7046 break;
7047
7048 case _Py_ERROR_IGNORE:
7049 ++s;
7050 break;
7051
7052 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 startinpos = s-starts;
7054 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007055 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007056 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 "ascii", "ordinal not in range(128)",
7058 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007059 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007061 kind = writer.kind;
7062 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007065 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007066 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007067 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007068
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007070 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007071 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007072 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 return NULL;
7074}
7075
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007076/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007077PyObject *
7078PyUnicode_EncodeASCII(const Py_UNICODE *p,
7079 Py_ssize_t size,
7080 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007082 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007083 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007084 if (unicode == NULL)
7085 return NULL;
7086 result = unicode_encode_ucs1(unicode, errors, 128);
7087 Py_DECREF(unicode);
7088 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089}
7090
Alexander Belopolsky40018472011-02-26 01:02:56 +00007091PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007092_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093{
7094 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 PyErr_BadArgument();
7096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007098 if (PyUnicode_READY(unicode) == -1)
7099 return NULL;
7100 /* Fast path: if it is an ASCII-only string, construct bytes object
7101 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007102 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007103 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7104 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007105 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007106}
7107
7108PyObject *
7109PyUnicode_AsASCIIString(PyObject *unicode)
7110{
7111 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112}
7113
Steve Dowercc16be82016-09-08 10:35:16 -07007114#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007115
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007116/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007117
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007118#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119#define NEED_RETRY
7120#endif
7121
Victor Stinner3a50e702011-10-18 21:21:00 +02007122#ifndef WC_ERR_INVALID_CHARS
7123# define WC_ERR_INVALID_CHARS 0x0080
7124#endif
7125
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007126static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007127code_page_name(UINT code_page, PyObject **obj)
7128{
7129 *obj = NULL;
7130 if (code_page == CP_ACP)
7131 return "mbcs";
7132 if (code_page == CP_UTF7)
7133 return "CP_UTF7";
7134 if (code_page == CP_UTF8)
7135 return "CP_UTF8";
7136
7137 *obj = PyBytes_FromFormat("cp%u", code_page);
7138 if (*obj == NULL)
7139 return NULL;
7140 return PyBytes_AS_STRING(*obj);
7141}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142
Victor Stinner3a50e702011-10-18 21:21:00 +02007143static DWORD
7144decode_code_page_flags(UINT code_page)
7145{
7146 if (code_page == CP_UTF7) {
7147 /* The CP_UTF7 decoder only supports flags=0 */
7148 return 0;
7149 }
7150 else
7151 return MB_ERR_INVALID_CHARS;
7152}
7153
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 * Decode a byte string from a Windows code page into unicode object in strict
7156 * mode.
7157 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007158 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7159 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007161static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007162decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007163 wchar_t **buf,
7164 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 const char *in,
7166 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007167{
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007169 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007171
7172 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 assert(insize > 0);
7174 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7175 if (outsize <= 0)
7176 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007177
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007178 /* Extend a wchar_t* buffer */
7179 Py_ssize_t n = *bufsize; /* Get the current length */
7180 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7181 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007182 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007183 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007184
7185 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7187 if (outsize <= 0)
7188 goto error;
7189 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007190
Victor Stinner3a50e702011-10-18 21:21:00 +02007191error:
7192 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7193 return -2;
7194 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007195 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007196}
7197
Victor Stinner3a50e702011-10-18 21:21:00 +02007198/*
7199 * Decode a byte string from a code page into unicode object with an error
7200 * handler.
7201 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007202 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 * UnicodeDecodeError exception and returns -1 on error.
7204 */
7205static int
7206decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007207 wchar_t **buf,
7208 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007209 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007210 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007211{
7212 const char *startin = in;
7213 const char *endin = in + size;
7214 const DWORD flags = decode_code_page_flags(code_page);
7215 /* Ideally, we should get reason from FormatMessage. This is the Windows
7216 2000 English version of the message. */
7217 const char *reason = "No mapping for the Unicode character exists "
7218 "in the target code page.";
7219 /* each step cannot decode more than 1 character, but a character can be
7220 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007221 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007222 int insize;
7223 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 PyObject *errorHandler = NULL;
7225 PyObject *exc = NULL;
7226 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007227 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 DWORD err;
7229 int ret = -1;
7230
7231 assert(size > 0);
7232
7233 encoding = code_page_name(code_page, &encoding_obj);
7234 if (encoding == NULL)
7235 return -1;
7236
Victor Stinner7d00cc12014-03-17 23:08:06 +01007237 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7239 UnicodeDecodeError. */
7240 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7241 if (exc != NULL) {
7242 PyCodec_StrictErrors(exc);
7243 Py_CLEAR(exc);
7244 }
7245 goto error;
7246 }
7247
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007248 /* Extend a wchar_t* buffer */
7249 Py_ssize_t n = *bufsize; /* Get the current length */
7250 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7251 PyErr_NoMemory();
7252 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007254 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7255 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007257 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007258
7259 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 while (in < endin)
7261 {
7262 /* Decode a character */
7263 insize = 1;
7264 do
7265 {
7266 outsize = MultiByteToWideChar(code_page, flags,
7267 in, insize,
7268 buffer, Py_ARRAY_LENGTH(buffer));
7269 if (outsize > 0)
7270 break;
7271 err = GetLastError();
7272 if (err != ERROR_NO_UNICODE_TRANSLATION
7273 && err != ERROR_INSUFFICIENT_BUFFER)
7274 {
7275 PyErr_SetFromWindowsErr(0);
7276 goto error;
7277 }
7278 insize++;
7279 }
7280 /* 4=maximum length of a UTF-8 sequence */
7281 while (insize <= 4 && (in + insize) <= endin);
7282
7283 if (outsize <= 0) {
7284 Py_ssize_t startinpos, endinpos, outpos;
7285
Victor Stinner7d00cc12014-03-17 23:08:06 +01007286 /* last character in partial decode? */
7287 if (in + insize >= endin && !final)
7288 break;
7289
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 startinpos = in - startin;
7291 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007292 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007293 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 errors, &errorHandler,
7295 encoding, reason,
7296 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007297 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007298 {
7299 goto error;
7300 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007301 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 }
7303 else {
7304 in += insize;
7305 memcpy(out, buffer, outsize * sizeof(wchar_t));
7306 out += outsize;
7307 }
7308 }
7309
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007310 /* Shrink the buffer */
7311 assert(out - *buf <= *bufsize);
7312 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007313 /* (in - startin) <= size and size is an int */
7314 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007315
7316error:
7317 Py_XDECREF(encoding_obj);
7318 Py_XDECREF(errorHandler);
7319 Py_XDECREF(exc);
7320 return ret;
7321}
7322
Victor Stinner3a50e702011-10-18 21:21:00 +02007323static PyObject *
7324decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007325 const char *s, Py_ssize_t size,
7326 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007328 wchar_t *buf = NULL;
7329 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007330 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007331
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 if (code_page < 0) {
7333 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7334 return NULL;
7335 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007336 if (size < 0) {
7337 PyErr_BadInternalCall();
7338 return NULL;
7339 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007340
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343
Victor Stinner76a31a62011-11-04 00:05:13 +01007344 do
7345 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007346#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007347 if (size > INT_MAX) {
7348 chunk_size = INT_MAX;
7349 final = 0;
7350 done = 0;
7351 }
7352 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007353#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007354 {
7355 chunk_size = (int)size;
7356 final = (consumed == NULL);
7357 done = 1;
7358 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359
Victor Stinner76a31a62011-11-04 00:05:13 +01007360 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007361 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007362 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007363 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007364 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007366 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007367 s, chunk_size);
7368 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007369 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007370 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007371 errors, final);
7372 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007373
7374 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007375 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007376 return NULL;
7377 }
7378
7379 if (consumed)
7380 *consumed += converted;
7381
7382 s += converted;
7383 size -= converted;
7384 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007385
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007386 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7387 PyMem_Free(buf);
7388 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007389}
7390
Alexander Belopolsky40018472011-02-26 01:02:56 +00007391PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007392PyUnicode_DecodeCodePageStateful(int code_page,
7393 const char *s,
7394 Py_ssize_t size,
7395 const char *errors,
7396 Py_ssize_t *consumed)
7397{
7398 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7399}
7400
7401PyObject *
7402PyUnicode_DecodeMBCSStateful(const char *s,
7403 Py_ssize_t size,
7404 const char *errors,
7405 Py_ssize_t *consumed)
7406{
7407 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7408}
7409
7410PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007411PyUnicode_DecodeMBCS(const char *s,
7412 Py_ssize_t size,
7413 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007414{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007415 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7416}
7417
Victor Stinner3a50e702011-10-18 21:21:00 +02007418static DWORD
7419encode_code_page_flags(UINT code_page, const char *errors)
7420{
7421 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007422 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 }
7424 else if (code_page == CP_UTF7) {
7425 /* CP_UTF7 only supports flags=0 */
7426 return 0;
7427 }
7428 else {
7429 if (errors != NULL && strcmp(errors, "replace") == 0)
7430 return 0;
7431 else
7432 return WC_NO_BEST_FIT_CHARS;
7433 }
7434}
7435
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007436/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 * Encode a Unicode string to a Windows code page into a byte string in strict
7438 * mode.
7439 *
7440 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007441 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007442 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007443static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007444encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007445 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007446 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007447{
Victor Stinner554f3f02010-06-16 23:33:54 +00007448 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 BOOL *pusedDefaultChar = &usedDefaultChar;
7450 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007451 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 const DWORD flags = encode_code_page_flags(code_page, NULL);
7454 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007455 /* Create a substring so that we can get the UTF-16 representation
7456 of just the slice under consideration. */
7457 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458
Martin v. Löwis3d325192011-11-04 18:23:06 +01007459 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007460
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007462 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007464 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007465
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466 substring = PyUnicode_Substring(unicode, offset, offset+len);
7467 if (substring == NULL)
7468 return -1;
7469 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7470 if (p == NULL) {
7471 Py_DECREF(substring);
7472 return -1;
7473 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007474 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007476 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007478 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 NULL, 0,
7480 NULL, pusedDefaultChar);
7481 if (outsize <= 0)
7482 goto error;
7483 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484 if (pusedDefaultChar && *pusedDefaultChar) {
7485 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007487 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007488
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007492 if (*outbytes == NULL) {
7493 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007495 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007497 }
7498 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007500 const Py_ssize_t n = PyBytes_Size(*outbytes);
7501 if (outsize > PY_SSIZE_T_MAX - n) {
7502 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007506 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7507 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007509 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007511 }
7512
7513 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007515 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 out, outsize,
7517 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007518 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007519 if (outsize <= 0)
7520 goto error;
7521 if (pusedDefaultChar && *pusedDefaultChar)
7522 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007523 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007524
Victor Stinner3a50e702011-10-18 21:21:00 +02007525error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007526 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7528 return -2;
7529 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007530 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007531}
7532
Victor Stinner3a50e702011-10-18 21:21:00 +02007533/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007534 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007535 * error handler.
7536 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007537 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 * -1 on other error.
7539 */
7540static int
7541encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007542 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007543 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007544{
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007546 Py_ssize_t pos = unicode_offset;
7547 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007548 /* Ideally, we should get reason from FormatMessage. This is the Windows
7549 2000 English version of the message. */
7550 const char *reason = "invalid character";
7551 /* 4=maximum length of a UTF-8 sequence */
7552 char buffer[4];
7553 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7554 Py_ssize_t outsize;
7555 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007556 PyObject *errorHandler = NULL;
7557 PyObject *exc = NULL;
7558 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007559 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007560 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007561 PyObject *rep;
7562 int ret = -1;
7563
7564 assert(insize > 0);
7565
7566 encoding = code_page_name(code_page, &encoding_obj);
7567 if (encoding == NULL)
7568 return -1;
7569
7570 if (errors == NULL || strcmp(errors, "strict") == 0) {
7571 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7572 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007573 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 if (exc != NULL) {
7575 PyCodec_StrictErrors(exc);
7576 Py_DECREF(exc);
7577 }
7578 Py_XDECREF(encoding_obj);
7579 return -1;
7580 }
7581
7582 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7583 pusedDefaultChar = &usedDefaultChar;
7584 else
7585 pusedDefaultChar = NULL;
7586
7587 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7588 PyErr_NoMemory();
7589 goto error;
7590 }
7591 outsize = insize * Py_ARRAY_LENGTH(buffer);
7592
7593 if (*outbytes == NULL) {
7594 /* Create string object */
7595 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7596 if (*outbytes == NULL)
7597 goto error;
7598 out = PyBytes_AS_STRING(*outbytes);
7599 }
7600 else {
7601 /* Extend string object */
7602 Py_ssize_t n = PyBytes_Size(*outbytes);
7603 if (n > PY_SSIZE_T_MAX - outsize) {
7604 PyErr_NoMemory();
7605 goto error;
7606 }
7607 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7608 goto error;
7609 out = PyBytes_AS_STRING(*outbytes) + n;
7610 }
7611
7612 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007613 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007615 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7616 wchar_t chars[2];
7617 int charsize;
7618 if (ch < 0x10000) {
7619 chars[0] = (wchar_t)ch;
7620 charsize = 1;
7621 }
7622 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007623 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7624 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007625 charsize = 2;
7626 }
7627
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 buffer, Py_ARRAY_LENGTH(buffer),
7631 NULL, pusedDefaultChar);
7632 if (outsize > 0) {
7633 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7634 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007635 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 memcpy(out, buffer, outsize);
7637 out += outsize;
7638 continue;
7639 }
7640 }
7641 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7642 PyErr_SetFromWindowsErr(0);
7643 goto error;
7644 }
7645
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 rep = unicode_encode_call_errorhandler(
7647 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007648 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007649 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 if (rep == NULL)
7651 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007652 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007653
7654 if (PyBytes_Check(rep)) {
7655 outsize = PyBytes_GET_SIZE(rep);
7656 if (outsize != 1) {
7657 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7658 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7659 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7660 Py_DECREF(rep);
7661 goto error;
7662 }
7663 out = PyBytes_AS_STRING(*outbytes) + offset;
7664 }
7665 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7666 out += outsize;
7667 }
7668 else {
7669 Py_ssize_t i;
7670 enum PyUnicode_Kind kind;
7671 void *data;
7672
Benjamin Petersonbac79492012-01-14 13:34:47 -05007673 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007674 Py_DECREF(rep);
7675 goto error;
7676 }
7677
7678 outsize = PyUnicode_GET_LENGTH(rep);
7679 if (outsize != 1) {
7680 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7681 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7682 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7683 Py_DECREF(rep);
7684 goto error;
7685 }
7686 out = PyBytes_AS_STRING(*outbytes) + offset;
7687 }
7688 kind = PyUnicode_KIND(rep);
7689 data = PyUnicode_DATA(rep);
7690 for (i=0; i < outsize; i++) {
7691 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7692 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007693 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007694 encoding, unicode,
7695 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007696 "unable to encode error handler result to ASCII");
7697 Py_DECREF(rep);
7698 goto error;
7699 }
7700 *out = (unsigned char)ch;
7701 out++;
7702 }
7703 }
7704 Py_DECREF(rep);
7705 }
7706 /* write a NUL byte */
7707 *out = 0;
7708 outsize = out - PyBytes_AS_STRING(*outbytes);
7709 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7710 if (_PyBytes_Resize(outbytes, outsize) < 0)
7711 goto error;
7712 ret = 0;
7713
7714error:
7715 Py_XDECREF(encoding_obj);
7716 Py_XDECREF(errorHandler);
7717 Py_XDECREF(exc);
7718 return ret;
7719}
7720
Victor Stinner3a50e702011-10-18 21:21:00 +02007721static PyObject *
7722encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007723 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007724 const char *errors)
7725{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007726 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007727 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007728 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007729 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007730
Victor Stinner29dacf22015-01-26 16:41:32 +01007731 if (!PyUnicode_Check(unicode)) {
7732 PyErr_BadArgument();
7733 return NULL;
7734 }
7735
Benjamin Petersonbac79492012-01-14 13:34:47 -05007736 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007737 return NULL;
7738 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007739
Victor Stinner3a50e702011-10-18 21:21:00 +02007740 if (code_page < 0) {
7741 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7742 return NULL;
7743 }
7744
Martin v. Löwis3d325192011-11-04 18:23:06 +01007745 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007746 return PyBytes_FromStringAndSize(NULL, 0);
7747
Victor Stinner7581cef2011-11-03 22:32:33 +01007748 offset = 0;
7749 do
7750 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007752 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007753 chunks. */
7754 if (len > INT_MAX/2) {
7755 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007756 done = 0;
7757 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007758 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007759#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007760 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007761 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007762 done = 1;
7763 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007764
Victor Stinner76a31a62011-11-04 00:05:13 +01007765 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007766 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007767 errors);
7768 if (ret == -2)
7769 ret = encode_code_page_errors(code_page, &outbytes,
7770 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007771 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007772 if (ret < 0) {
7773 Py_XDECREF(outbytes);
7774 return NULL;
7775 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007776
Victor Stinner7581cef2011-11-03 22:32:33 +01007777 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007778 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007779 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007780
Victor Stinner3a50e702011-10-18 21:21:00 +02007781 return outbytes;
7782}
7783
7784PyObject *
7785PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7786 Py_ssize_t size,
7787 const char *errors)
7788{
Victor Stinner7581cef2011-11-03 22:32:33 +01007789 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007790 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007791 if (unicode == NULL)
7792 return NULL;
7793 res = encode_code_page(CP_ACP, unicode, errors);
7794 Py_DECREF(unicode);
7795 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007796}
7797
7798PyObject *
7799PyUnicode_EncodeCodePage(int code_page,
7800 PyObject *unicode,
7801 const char *errors)
7802{
Victor Stinner7581cef2011-11-03 22:32:33 +01007803 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007804}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007805
Alexander Belopolsky40018472011-02-26 01:02:56 +00007806PyObject *
7807PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007808{
Victor Stinner7581cef2011-11-03 22:32:33 +01007809 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007810}
7811
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007812#undef NEED_RETRY
7813
Steve Dowercc16be82016-09-08 10:35:16 -07007814#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007815
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816/* --- Character Mapping Codec -------------------------------------------- */
7817
Victor Stinnerfb161b12013-04-18 01:44:27 +02007818static int
7819charmap_decode_string(const char *s,
7820 Py_ssize_t size,
7821 PyObject *mapping,
7822 const char *errors,
7823 _PyUnicodeWriter *writer)
7824{
7825 const char *starts = s;
7826 const char *e;
7827 Py_ssize_t startinpos, endinpos;
7828 PyObject *errorHandler = NULL, *exc = NULL;
7829 Py_ssize_t maplen;
7830 enum PyUnicode_Kind mapkind;
7831 void *mapdata;
7832 Py_UCS4 x;
7833 unsigned char ch;
7834
7835 if (PyUnicode_READY(mapping) == -1)
7836 return -1;
7837
7838 maplen = PyUnicode_GET_LENGTH(mapping);
7839 mapdata = PyUnicode_DATA(mapping);
7840 mapkind = PyUnicode_KIND(mapping);
7841
7842 e = s + size;
7843
7844 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7845 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7846 * is disabled in encoding aliases, latin1 is preferred because
7847 * its implementation is faster. */
7848 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7849 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7850 Py_UCS4 maxchar = writer->maxchar;
7851
7852 assert (writer->kind == PyUnicode_1BYTE_KIND);
7853 while (s < e) {
7854 ch = *s;
7855 x = mapdata_ucs1[ch];
7856 if (x > maxchar) {
7857 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7858 goto onError;
7859 maxchar = writer->maxchar;
7860 outdata = (Py_UCS1 *)writer->data;
7861 }
7862 outdata[writer->pos] = x;
7863 writer->pos++;
7864 ++s;
7865 }
7866 return 0;
7867 }
7868
7869 while (s < e) {
7870 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7871 enum PyUnicode_Kind outkind = writer->kind;
7872 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7873 if (outkind == PyUnicode_1BYTE_KIND) {
7874 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7875 Py_UCS4 maxchar = writer->maxchar;
7876 while (s < e) {
7877 ch = *s;
7878 x = mapdata_ucs2[ch];
7879 if (x > maxchar)
7880 goto Error;
7881 outdata[writer->pos] = x;
7882 writer->pos++;
7883 ++s;
7884 }
7885 break;
7886 }
7887 else if (outkind == PyUnicode_2BYTE_KIND) {
7888 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7889 while (s < e) {
7890 ch = *s;
7891 x = mapdata_ucs2[ch];
7892 if (x == 0xFFFE)
7893 goto Error;
7894 outdata[writer->pos] = x;
7895 writer->pos++;
7896 ++s;
7897 }
7898 break;
7899 }
7900 }
7901 ch = *s;
7902
7903 if (ch < maplen)
7904 x = PyUnicode_READ(mapkind, mapdata, ch);
7905 else
7906 x = 0xfffe; /* invalid value */
7907Error:
7908 if (x == 0xfffe)
7909 {
7910 /* undefined mapping */
7911 startinpos = s-starts;
7912 endinpos = startinpos+1;
7913 if (unicode_decode_call_errorhandler_writer(
7914 errors, &errorHandler,
7915 "charmap", "character maps to <undefined>",
7916 &starts, &e, &startinpos, &endinpos, &exc, &s,
7917 writer)) {
7918 goto onError;
7919 }
7920 continue;
7921 }
7922
7923 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7924 goto onError;
7925 ++s;
7926 }
7927 Py_XDECREF(errorHandler);
7928 Py_XDECREF(exc);
7929 return 0;
7930
7931onError:
7932 Py_XDECREF(errorHandler);
7933 Py_XDECREF(exc);
7934 return -1;
7935}
7936
7937static int
7938charmap_decode_mapping(const char *s,
7939 Py_ssize_t size,
7940 PyObject *mapping,
7941 const char *errors,
7942 _PyUnicodeWriter *writer)
7943{
7944 const char *starts = s;
7945 const char *e;
7946 Py_ssize_t startinpos, endinpos;
7947 PyObject *errorHandler = NULL, *exc = NULL;
7948 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007949 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007950
7951 e = s + size;
7952
7953 while (s < e) {
7954 ch = *s;
7955
7956 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7957 key = PyLong_FromLong((long)ch);
7958 if (key == NULL)
7959 goto onError;
7960
7961 item = PyObject_GetItem(mapping, key);
7962 Py_DECREF(key);
7963 if (item == NULL) {
7964 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7965 /* No mapping found means: mapping is undefined. */
7966 PyErr_Clear();
7967 goto Undefined;
7968 } else
7969 goto onError;
7970 }
7971
7972 /* Apply mapping */
7973 if (item == Py_None)
7974 goto Undefined;
7975 if (PyLong_Check(item)) {
7976 long value = PyLong_AS_LONG(item);
7977 if (value == 0xFFFE)
7978 goto Undefined;
7979 if (value < 0 || value > MAX_UNICODE) {
7980 PyErr_Format(PyExc_TypeError,
7981 "character mapping must be in range(0x%lx)",
7982 (unsigned long)MAX_UNICODE + 1);
7983 goto onError;
7984 }
7985
7986 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7987 goto onError;
7988 }
7989 else if (PyUnicode_Check(item)) {
7990 if (PyUnicode_READY(item) == -1)
7991 goto onError;
7992 if (PyUnicode_GET_LENGTH(item) == 1) {
7993 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7994 if (value == 0xFFFE)
7995 goto Undefined;
7996 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7997 goto onError;
7998 }
7999 else {
8000 writer->overallocate = 1;
8001 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8002 goto onError;
8003 }
8004 }
8005 else {
8006 /* wrong return value */
8007 PyErr_SetString(PyExc_TypeError,
8008 "character mapping must return integer, None or str");
8009 goto onError;
8010 }
8011 Py_CLEAR(item);
8012 ++s;
8013 continue;
8014
8015Undefined:
8016 /* undefined mapping */
8017 Py_CLEAR(item);
8018 startinpos = s-starts;
8019 endinpos = startinpos+1;
8020 if (unicode_decode_call_errorhandler_writer(
8021 errors, &errorHandler,
8022 "charmap", "character maps to <undefined>",
8023 &starts, &e, &startinpos, &endinpos, &exc, &s,
8024 writer)) {
8025 goto onError;
8026 }
8027 }
8028 Py_XDECREF(errorHandler);
8029 Py_XDECREF(exc);
8030 return 0;
8031
8032onError:
8033 Py_XDECREF(item);
8034 Py_XDECREF(errorHandler);
8035 Py_XDECREF(exc);
8036 return -1;
8037}
8038
Alexander Belopolsky40018472011-02-26 01:02:56 +00008039PyObject *
8040PyUnicode_DecodeCharmap(const char *s,
8041 Py_ssize_t size,
8042 PyObject *mapping,
8043 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008045 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008046
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 /* Default to Latin-1 */
8048 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008052 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008053 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008054 writer.min_length = size;
8055 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008057
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008058 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008059 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8060 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008061 }
8062 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008063 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8064 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008066 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008067
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008069 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 return NULL;
8071}
8072
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008073/* Charmap encoding: the lookup table */
8074
Alexander Belopolsky40018472011-02-26 01:02:56 +00008075struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 PyObject_HEAD
8077 unsigned char level1[32];
8078 int count2, count3;
8079 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080};
8081
8082static PyObject*
8083encoding_map_size(PyObject *obj, PyObject* args)
8084{
8085 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008086 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008088}
8089
8090static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008091 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 PyDoc_STR("Return the size (in bytes) of this object") },
8093 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008094};
8095
8096static void
8097encoding_map_dealloc(PyObject* o)
8098{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008099 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100}
8101
8102static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008103 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 "EncodingMap", /*tp_name*/
8105 sizeof(struct encoding_map), /*tp_basicsize*/
8106 0, /*tp_itemsize*/
8107 /* methods */
8108 encoding_map_dealloc, /*tp_dealloc*/
8109 0, /*tp_print*/
8110 0, /*tp_getattr*/
8111 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008112 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 0, /*tp_repr*/
8114 0, /*tp_as_number*/
8115 0, /*tp_as_sequence*/
8116 0, /*tp_as_mapping*/
8117 0, /*tp_hash*/
8118 0, /*tp_call*/
8119 0, /*tp_str*/
8120 0, /*tp_getattro*/
8121 0, /*tp_setattro*/
8122 0, /*tp_as_buffer*/
8123 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8124 0, /*tp_doc*/
8125 0, /*tp_traverse*/
8126 0, /*tp_clear*/
8127 0, /*tp_richcompare*/
8128 0, /*tp_weaklistoffset*/
8129 0, /*tp_iter*/
8130 0, /*tp_iternext*/
8131 encoding_map_methods, /*tp_methods*/
8132 0, /*tp_members*/
8133 0, /*tp_getset*/
8134 0, /*tp_base*/
8135 0, /*tp_dict*/
8136 0, /*tp_descr_get*/
8137 0, /*tp_descr_set*/
8138 0, /*tp_dictoffset*/
8139 0, /*tp_init*/
8140 0, /*tp_alloc*/
8141 0, /*tp_new*/
8142 0, /*tp_free*/
8143 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144};
8145
8146PyObject*
8147PyUnicode_BuildEncodingMap(PyObject* string)
8148{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 PyObject *result;
8150 struct encoding_map *mresult;
8151 int i;
8152 int need_dict = 0;
8153 unsigned char level1[32];
8154 unsigned char level2[512];
8155 unsigned char *mlevel1, *mlevel2, *mlevel3;
8156 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008157 int kind;
8158 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008159 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008162 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 PyErr_BadArgument();
8164 return NULL;
8165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 kind = PyUnicode_KIND(string);
8167 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008168 length = PyUnicode_GET_LENGTH(string);
8169 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 memset(level1, 0xFF, sizeof level1);
8171 memset(level2, 0xFF, sizeof level2);
8172
8173 /* If there isn't a one-to-one mapping of NULL to \0,
8174 or if there are non-BMP characters, we need to use
8175 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008178 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008179 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 ch = PyUnicode_READ(kind, data, i);
8181 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182 need_dict = 1;
8183 break;
8184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008185 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008186 /* unmapped character */
8187 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 l1 = ch >> 11;
8189 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008190 if (level1[l1] == 0xFF)
8191 level1[l1] = count2++;
8192 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008194 }
8195
8196 if (count2 >= 0xFF || count3 >= 0xFF)
8197 need_dict = 1;
8198
8199 if (need_dict) {
8200 PyObject *result = PyDict_New();
8201 PyObject *key, *value;
8202 if (!result)
8203 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008204 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008205 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008206 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008207 if (!key || !value)
8208 goto failed1;
8209 if (PyDict_SetItem(result, key, value) == -1)
8210 goto failed1;
8211 Py_DECREF(key);
8212 Py_DECREF(value);
8213 }
8214 return result;
8215 failed1:
8216 Py_XDECREF(key);
8217 Py_XDECREF(value);
8218 Py_DECREF(result);
8219 return NULL;
8220 }
8221
8222 /* Create a three-level trie */
8223 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8224 16*count2 + 128*count3 - 1);
8225 if (!result)
8226 return PyErr_NoMemory();
8227 PyObject_Init(result, &EncodingMapType);
8228 mresult = (struct encoding_map*)result;
8229 mresult->count2 = count2;
8230 mresult->count3 = count3;
8231 mlevel1 = mresult->level1;
8232 mlevel2 = mresult->level23;
8233 mlevel3 = mresult->level23 + 16*count2;
8234 memcpy(mlevel1, level1, 32);
8235 memset(mlevel2, 0xFF, 16*count2);
8236 memset(mlevel3, 0, 128*count3);
8237 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008238 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008239 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008240 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8241 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008242 /* unmapped character */
8243 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008244 o1 = ch>>11;
8245 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246 i2 = 16*mlevel1[o1] + o2;
8247 if (mlevel2[i2] == 0xFF)
8248 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008249 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008250 i3 = 128*mlevel2[i2] + o3;
8251 mlevel3[i3] = i;
8252 }
8253 return result;
8254}
8255
8256static int
Victor Stinner22168992011-11-20 17:09:18 +01008257encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008258{
8259 struct encoding_map *map = (struct encoding_map*)mapping;
8260 int l1 = c>>11;
8261 int l2 = (c>>7) & 0xF;
8262 int l3 = c & 0x7F;
8263 int i;
8264
Victor Stinner22168992011-11-20 17:09:18 +01008265 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008267 if (c == 0)
8268 return 0;
8269 /* level 1*/
8270 i = map->level1[l1];
8271 if (i == 0xFF) {
8272 return -1;
8273 }
8274 /* level 2*/
8275 i = map->level23[16*i+l2];
8276 if (i == 0xFF) {
8277 return -1;
8278 }
8279 /* level 3 */
8280 i = map->level23[16*map->count2 + 128*i + l3];
8281 if (i == 0) {
8282 return -1;
8283 }
8284 return i;
8285}
8286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008287/* Lookup the character ch in the mapping. If the character
8288 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008289 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008290static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008291charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292{
Christian Heimes217cfd12007-12-02 14:31:20 +00008293 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 PyObject *x;
8295
8296 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 x = PyObject_GetItem(mapping, w);
8299 Py_DECREF(w);
8300 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8302 /* No mapping found means: mapping is undefined. */
8303 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008304 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 } else
8306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008308 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008310 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 long value = PyLong_AS_LONG(x);
8312 if (value < 0 || value > 255) {
8313 PyErr_SetString(PyExc_TypeError,
8314 "character mapping must be in range(256)");
8315 Py_DECREF(x);
8316 return NULL;
8317 }
8318 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008320 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 /* wrong return value */
8324 PyErr_Format(PyExc_TypeError,
8325 "character mapping must return integer, bytes or None, not %.400s",
8326 x->ob_type->tp_name);
8327 Py_DECREF(x);
8328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 }
8330}
8331
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008333charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008335 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8336 /* exponentially overallocate to minimize reallocations */
8337 if (requiredsize < 2*outsize)
8338 requiredsize = 2*outsize;
8339 if (_PyBytes_Resize(outobj, requiredsize))
8340 return -1;
8341 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342}
8343
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008348 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349 space is available. Return a new reference to the object that
8350 was put in the output buffer, or Py_None, if the mapping was undefined
8351 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008352 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008353static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008354charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008355 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008357 PyObject *rep;
8358 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008359 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360
Christian Heimes90aa7642007-12-19 02:45:37 +00008361 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008362 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008364 if (res == -1)
8365 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 if (outsize<requiredsize)
8367 if (charmapencode_resize(outobj, outpos, requiredsize))
8368 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008369 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 outstart[(*outpos)++] = (char)res;
8371 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 }
8373
8374 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008377 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 Py_DECREF(rep);
8379 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 if (PyLong_Check(rep)) {
8382 Py_ssize_t requiredsize = *outpos+1;
8383 if (outsize<requiredsize)
8384 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8385 Py_DECREF(rep);
8386 return enc_EXCEPTION;
8387 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008388 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 else {
8392 const char *repchars = PyBytes_AS_STRING(rep);
8393 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8394 Py_ssize_t requiredsize = *outpos+repsize;
8395 if (outsize<requiredsize)
8396 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8397 Py_DECREF(rep);
8398 return enc_EXCEPTION;
8399 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008400 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 memcpy(outstart + *outpos, repchars, repsize);
8402 *outpos += repsize;
8403 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008405 Py_DECREF(rep);
8406 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407}
8408
8409/* handle an error in PyUnicode_EncodeCharmap
8410 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008411static int
8412charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008413 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008415 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008416 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417{
8418 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008420 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008421 enum PyUnicode_Kind kind;
8422 void *data;
8423 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008425 Py_ssize_t collstartpos = *inpos;
8426 Py_ssize_t collendpos = *inpos+1;
8427 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008428 const char *encoding = "charmap";
8429 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008430 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008431 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008432 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433
Benjamin Petersonbac79492012-01-14 13:34:47 -05008434 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008435 return -1;
8436 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 /* find all unencodable characters */
8438 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008439 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008440 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008441 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008442 val = encoding_map_lookup(ch, mapping);
8443 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 break;
8445 ++collendpos;
8446 continue;
8447 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008449 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8450 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 if (rep==NULL)
8452 return -1;
8453 else if (rep!=Py_None) {
8454 Py_DECREF(rep);
8455 break;
8456 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 }
8460 /* cache callback name lookup
8461 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008462 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008463 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008464
8465 switch (*error_handler) {
8466 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008467 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008468 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008469
8470 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 x = charmapencode_output('?', mapping, res, respos);
8473 if (x==enc_EXCEPTION) {
8474 return -1;
8475 }
8476 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008477 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 return -1;
8479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 }
8481 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008482 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008483 *inpos = collendpos;
8484 break;
Victor Stinner50149202015-09-22 00:26:54 +02008485
8486 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 /* generate replacement (temporarily (mis)uses p) */
8488 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 char buffer[2+29+1+1];
8490 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008491 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 for (cp = buffer; *cp; ++cp) {
8493 x = charmapencode_output(*cp, mapping, res, respos);
8494 if (x==enc_EXCEPTION)
8495 return -1;
8496 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008497 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 return -1;
8499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008500 }
8501 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008502 *inpos = collendpos;
8503 break;
Victor Stinner50149202015-09-22 00:26:54 +02008504
Benjamin Peterson14339b62009-01-31 16:36:08 +00008505 default:
Victor Stinner50149202015-09-22 00:26:54 +02008506 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008507 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008509 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008511 if (PyBytes_Check(repunicode)) {
8512 /* Directly copy bytes result to output. */
8513 Py_ssize_t outsize = PyBytes_Size(*res);
8514 Py_ssize_t requiredsize;
8515 repsize = PyBytes_Size(repunicode);
8516 requiredsize = *respos + repsize;
8517 if (requiredsize > outsize)
8518 /* Make room for all additional bytes. */
8519 if (charmapencode_resize(res, respos, requiredsize)) {
8520 Py_DECREF(repunicode);
8521 return -1;
8522 }
8523 memcpy(PyBytes_AsString(*res) + *respos,
8524 PyBytes_AsString(repunicode), repsize);
8525 *respos += repsize;
8526 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008527 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008528 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008531 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008532 Py_DECREF(repunicode);
8533 return -1;
8534 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008535 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008536 data = PyUnicode_DATA(repunicode);
8537 kind = PyUnicode_KIND(repunicode);
8538 for (index = 0; index < repsize; index++) {
8539 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8540 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008542 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 return -1;
8544 }
8545 else if (x==enc_FAILED) {
8546 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008547 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 return -1;
8549 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008550 }
8551 *inpos = newpos;
8552 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 }
8554 return 0;
8555}
8556
Alexander Belopolsky40018472011-02-26 01:02:56 +00008557PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558_PyUnicode_EncodeCharmap(PyObject *unicode,
8559 PyObject *mapping,
8560 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 /* output object */
8563 PyObject *res = NULL;
8564 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008565 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008566 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008568 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008569 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008571 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008572 void *data;
8573 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574
Benjamin Petersonbac79492012-01-14 13:34:47 -05008575 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008576 return NULL;
8577 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008578 data = PyUnicode_DATA(unicode);
8579 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 /* Default to Latin-1 */
8582 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008583 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 /* allocate enough for a simple encoding without
8586 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008587 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 if (res == NULL)
8589 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008590 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008594 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008596 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 if (x==enc_EXCEPTION) /* error */
8598 goto onError;
8599 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008600 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008602 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 &res, &respos)) {
8604 goto onError;
8605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 else
8608 /* done with this character => adjust input position */
8609 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008613 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008614 if (_PyBytes_Resize(&res, respos) < 0)
8615 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008617 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008618 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619 return res;
8620
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622 Py_XDECREF(res);
8623 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008624 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 return NULL;
8626}
8627
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008628/* Deprecated */
8629PyObject *
8630PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8631 Py_ssize_t size,
8632 PyObject *mapping,
8633 const char *errors)
8634{
8635 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008636 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008637 if (unicode == NULL)
8638 return NULL;
8639 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8640 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008641 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008642}
8643
Alexander Belopolsky40018472011-02-26 01:02:56 +00008644PyObject *
8645PyUnicode_AsCharmapString(PyObject *unicode,
8646 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647{
8648 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 PyErr_BadArgument();
8650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008652 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653}
8654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008656static void
8657make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659 Py_ssize_t startpos, Py_ssize_t endpos,
8660 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 *exceptionObject = _PyUnicodeTranslateError_Create(
8664 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 }
8666 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8668 goto onError;
8669 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8670 goto onError;
8671 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8672 goto onError;
8673 return;
8674 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008675 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 }
8677}
8678
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679/* error handling callback helper:
8680 build arguments, call the callback and check the arguments,
8681 put the result into newpos and return the replacement string, which
8682 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008683static PyObject *
8684unicode_translate_call_errorhandler(const char *errors,
8685 PyObject **errorHandler,
8686 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008688 Py_ssize_t startpos, Py_ssize_t endpos,
8689 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008691 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008693 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 PyObject *restuple;
8695 PyObject *resunicode;
8696
8697 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 }
8702
8703 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008708 restuple = PyObject_CallFunctionObjArgs(
8709 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008713 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 Py_DECREF(restuple);
8715 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008717 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 &resunicode, &i_newpos)) {
8719 Py_DECREF(restuple);
8720 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008722 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008724 else
8725 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008727 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 Py_DECREF(restuple);
8729 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008730 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731 Py_INCREF(resunicode);
8732 Py_DECREF(restuple);
8733 return resunicode;
8734}
8735
8736/* Lookup the character ch in the mapping and put the result in result,
8737 which must be decrefed by the caller.
8738 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008739static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741{
Christian Heimes217cfd12007-12-02 14:31:20 +00008742 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743 PyObject *x;
8744
8745 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008747 x = PyObject_GetItem(mapping, w);
8748 Py_DECREF(w);
8749 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8751 /* No mapping found means: use 1:1 mapping. */
8752 PyErr_Clear();
8753 *result = NULL;
8754 return 0;
8755 } else
8756 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 }
8758 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 *result = x;
8760 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008762 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008764 if (value < 0 || value > MAX_UNICODE) {
8765 PyErr_Format(PyExc_ValueError,
8766 "character mapping must be in range(0x%x)",
8767 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 Py_DECREF(x);
8769 return -1;
8770 }
8771 *result = x;
8772 return 0;
8773 }
8774 else if (PyUnicode_Check(x)) {
8775 *result = x;
8776 return 0;
8777 }
8778 else {
8779 /* wrong return value */
8780 PyErr_SetString(PyExc_TypeError,
8781 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008782 Py_DECREF(x);
8783 return -1;
8784 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785}
Victor Stinner1194ea02014-04-04 19:37:40 +02008786
8787/* lookup the character, write the result into the writer.
8788 Return 1 if the result was written into the writer, return 0 if the mapping
8789 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008790static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008791charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8792 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793{
Victor Stinner1194ea02014-04-04 19:37:40 +02008794 PyObject *item;
8795
8796 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008798
8799 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008801 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008804 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008806
8807 if (item == Py_None) {
8808 Py_DECREF(item);
8809 return 0;
8810 }
8811
8812 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008813 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8814 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8815 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008816 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8817 Py_DECREF(item);
8818 return -1;
8819 }
8820 Py_DECREF(item);
8821 return 1;
8822 }
8823
8824 if (!PyUnicode_Check(item)) {
8825 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008827 }
8828
8829 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8830 Py_DECREF(item);
8831 return -1;
8832 }
8833
8834 Py_DECREF(item);
8835 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008836}
8837
Victor Stinner89a76ab2014-04-05 11:44:04 +02008838static int
8839unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8840 Py_UCS1 *translate)
8841{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008842 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008843 int ret = 0;
8844
Victor Stinner89a76ab2014-04-05 11:44:04 +02008845 if (charmaptranslate_lookup(ch, mapping, &item)) {
8846 return -1;
8847 }
8848
8849 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008850 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008851 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008853 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008854 /* not found => default to 1:1 mapping */
8855 translate[ch] = ch;
8856 return 1;
8857 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008858 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008859 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008860 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8861 used it */
8862 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 /* invalid character or character outside ASCII:
8864 skip the fast translate */
8865 goto exit;
8866 }
8867 translate[ch] = (Py_UCS1)replace;
8868 }
8869 else if (PyUnicode_Check(item)) {
8870 Py_UCS4 replace;
8871
8872 if (PyUnicode_READY(item) == -1) {
8873 Py_DECREF(item);
8874 return -1;
8875 }
8876 if (PyUnicode_GET_LENGTH(item) != 1)
8877 goto exit;
8878
8879 replace = PyUnicode_READ_CHAR(item, 0);
8880 if (replace > 127)
8881 goto exit;
8882 translate[ch] = (Py_UCS1)replace;
8883 }
8884 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008885 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886 goto exit;
8887 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888 ret = 1;
8889
Benjamin Peterson1365de72014-04-07 20:15:41 -04008890 exit:
8891 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 return ret;
8893}
8894
8895/* Fast path for ascii => ascii translation. Return 1 if the whole string
8896 was translated into writer, return 0 if the input string was partially
8897 translated into writer, raise an exception and return -1 on error. */
8898static int
8899unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008900 _PyUnicodeWriter *writer, int ignore,
8901 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902{
Victor Stinner872b2912014-04-05 14:27:07 +02008903 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008904 Py_ssize_t len;
8905 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008906 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008907
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908 len = PyUnicode_GET_LENGTH(input);
8909
Victor Stinner872b2912014-04-05 14:27:07 +02008910 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008911
8912 in = PyUnicode_1BYTE_DATA(input);
8913 end = in + len;
8914
8915 assert(PyUnicode_IS_ASCII(writer->buffer));
8916 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8917 out = PyUnicode_1BYTE_DATA(writer->buffer);
8918
Victor Stinner872b2912014-04-05 14:27:07 +02008919 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008920 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008921 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008923 int translate = unicode_fast_translate_lookup(mapping, ch,
8924 ascii_table);
8925 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008927 if (translate == 0)
8928 goto exit;
8929 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008930 }
Victor Stinner872b2912014-04-05 14:27:07 +02008931 if (ch2 == 0xfe) {
8932 if (ignore)
8933 continue;
8934 goto exit;
8935 }
8936 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008937 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008938 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008939 }
Victor Stinner872b2912014-04-05 14:27:07 +02008940 res = 1;
8941
8942exit:
8943 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008944 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008945 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008946}
8947
Victor Stinner3222da22015-10-01 22:07:32 +02008948static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949_PyUnicode_TranslateCharmap(PyObject *input,
8950 PyObject *mapping,
8951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008954 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 Py_ssize_t size, i;
8956 int kind;
8957 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 _PyUnicodeWriter writer;
8959 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008960 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008961 PyObject *errorHandler = NULL;
8962 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008963 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008964 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008965
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 PyErr_BadArgument();
8968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 if (PyUnicode_READY(input) == -1)
8972 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008973 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 kind = PyUnicode_KIND(input);
8975 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008977 if (size == 0)
8978 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980 /* allocate enough for a simple 1:1 translation without
8981 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008982 _PyUnicodeWriter_Init(&writer);
8983 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985
Victor Stinner872b2912014-04-05 14:27:07 +02008986 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8987
Victor Stinner33798672016-03-01 21:59:58 +01008988 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008989 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008990 if (PyUnicode_IS_ASCII(input)) {
8991 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8992 if (res < 0) {
8993 _PyUnicodeWriter_Dealloc(&writer);
8994 return NULL;
8995 }
8996 if (res == 1)
8997 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008998 }
Victor Stinner33798672016-03-01 21:59:58 +01008999 else {
9000 i = 0;
9001 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009005 int translate;
9006 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9007 Py_ssize_t newpos;
9008 /* startpos for collecting untranslatable chars */
9009 Py_ssize_t collstart;
9010 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009011 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012
Victor Stinner1194ea02014-04-04 19:37:40 +02009013 ch = PyUnicode_READ(kind, data, i);
9014 translate = charmaptranslate_output(ch, mapping, &writer);
9015 if (translate < 0)
9016 goto onError;
9017
9018 if (translate != 0) {
9019 /* it worked => adjust input pointer */
9020 ++i;
9021 continue;
9022 }
9023
9024 /* untranslatable character */
9025 collstart = i;
9026 collend = i+1;
9027
9028 /* find all untranslatable characters */
9029 while (collend < size) {
9030 PyObject *x;
9031 ch = PyUnicode_READ(kind, data, collend);
9032 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009033 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009034 Py_XDECREF(x);
9035 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009037 ++collend;
9038 }
9039
9040 if (ignore) {
9041 i = collend;
9042 }
9043 else {
9044 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9045 reason, input, &exc,
9046 collstart, collend, &newpos);
9047 if (repunicode == NULL)
9048 goto onError;
9049 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009051 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009052 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009053 Py_DECREF(repunicode);
9054 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009055 }
9056 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009057 Py_XDECREF(exc);
9058 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009059 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009062 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009063 Py_XDECREF(exc);
9064 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065 return NULL;
9066}
9067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068/* Deprecated. Use PyUnicode_Translate instead. */
9069PyObject *
9070PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9071 Py_ssize_t size,
9072 PyObject *mapping,
9073 const char *errors)
9074{
Christian Heimes5f520f42012-09-11 14:03:25 +02009075 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009076 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 if (!unicode)
9078 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009079 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9080 Py_DECREF(unicode);
9081 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082}
9083
Alexander Belopolsky40018472011-02-26 01:02:56 +00009084PyObject *
9085PyUnicode_Translate(PyObject *str,
9086 PyObject *mapping,
9087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009089 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009090 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009091 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092}
Tim Petersced69f82003-09-16 20:30:58 +00009093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094PyObject *
9095_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9096{
9097 if (!PyUnicode_Check(unicode)) {
9098 PyErr_BadInternalCall();
9099 return NULL;
9100 }
9101 if (PyUnicode_READY(unicode) == -1)
9102 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009103 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 /* If the string is already ASCII, just return the same string */
9105 Py_INCREF(unicode);
9106 return unicode;
9107 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009108
9109 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9110 PyObject *result = PyUnicode_New(len, 127);
9111 if (result == NULL) {
9112 return NULL;
9113 }
9114
9115 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9116 int kind = PyUnicode_KIND(unicode);
9117 const void *data = PyUnicode_DATA(unicode);
9118 Py_ssize_t i;
9119 for (i = 0; i < len; ++i) {
9120 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9121 if (ch < 127) {
9122 out[i] = ch;
9123 }
9124 else if (Py_UNICODE_ISSPACE(ch)) {
9125 out[i] = ' ';
9126 }
9127 else {
9128 int decimal = Py_UNICODE_TODECIMAL(ch);
9129 if (decimal < 0) {
9130 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009131 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009132 _PyUnicode_LENGTH(result) = i + 1;
9133 break;
9134 }
9135 out[i] = '0' + decimal;
9136 }
9137 }
9138
INADA Naoki16dfca42018-07-14 12:06:43 +09009139 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009140 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141}
9142
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009143PyObject *
9144PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9145 Py_ssize_t length)
9146{
Victor Stinnerf0124502011-11-21 23:12:56 +01009147 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009148 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009149 Py_UCS4 maxchar;
9150 enum PyUnicode_Kind kind;
9151 void *data;
9152
Victor Stinner99d7ad02012-02-22 13:37:39 +01009153 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009154 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009155 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009156 if (ch > 127) {
9157 int decimal = Py_UNICODE_TODECIMAL(ch);
9158 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009159 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009160 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009161 }
9162 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009163
9164 /* Copy to a new string */
9165 decimal = PyUnicode_New(length, maxchar);
9166 if (decimal == NULL)
9167 return decimal;
9168 kind = PyUnicode_KIND(decimal);
9169 data = PyUnicode_DATA(decimal);
9170 /* Iterate over code points */
9171 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009172 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009173 if (ch > 127) {
9174 int decimal = Py_UNICODE_TODECIMAL(ch);
9175 if (decimal >= 0)
9176 ch = '0' + decimal;
9177 }
9178 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009180 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009181}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009182/* --- Decimal Encoder ---------------------------------------------------- */
9183
Alexander Belopolsky40018472011-02-26 01:02:56 +00009184int
9185PyUnicode_EncodeDecimal(Py_UNICODE *s,
9186 Py_ssize_t length,
9187 char *output,
9188 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009189{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009190 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009191 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009192 enum PyUnicode_Kind kind;
9193 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009194
9195 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 PyErr_BadArgument();
9197 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009198 }
9199
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009200 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009201 if (unicode == NULL)
9202 return -1;
9203
Victor Stinner42bf7752011-11-21 22:52:58 +01009204 kind = PyUnicode_KIND(unicode);
9205 data = PyUnicode_DATA(unicode);
9206
Victor Stinnerb84d7232011-11-22 01:50:07 +01009207 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009208 PyObject *exc;
9209 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009211 Py_ssize_t startpos;
9212
9213 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009214
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009216 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009217 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009219 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 decimal = Py_UNICODE_TODECIMAL(ch);
9221 if (decimal >= 0) {
9222 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009223 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 continue;
9225 }
9226 if (0 < ch && ch < 256) {
9227 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009228 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009229 continue;
9230 }
Victor Stinner6345be92011-11-25 20:09:01 +01009231
Victor Stinner42bf7752011-11-21 22:52:58 +01009232 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009233 exc = NULL;
9234 raise_encode_exception(&exc, "decimal", unicode,
9235 startpos, startpos+1,
9236 "invalid decimal Unicode string");
9237 Py_XDECREF(exc);
9238 Py_DECREF(unicode);
9239 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009240 }
9241 /* 0-terminate the output string */
9242 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009243 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009244 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009245}
9246
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247/* --- Helpers ------------------------------------------------------------ */
9248
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`, following Python slice conventions:

     - `end` greater than `len` is clamped to `len`; a negative `end` is
       counted from the end of the sequence and floored at 0;
     - a negative `start` is counted from the end and floored at 0; a
       `start` larger than `len` is left untouched (callers detect the
       resulting empty range themselves).

   `start` and `end` must be modifiable lvalues.  All three arguments may
   be evaluated more than once, so they must be free of side effects.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement; invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009265any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009267 Py_ssize_t end,
9268 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009270 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 void *buf1, *buf2;
9272 Py_ssize_t len1, len2, result;
9273
9274 kind1 = PyUnicode_KIND(s1);
9275 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009276 if (kind1 < kind2)
9277 return -1;
9278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 len1 = PyUnicode_GET_LENGTH(s1);
9280 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009281 ADJUST_INDICES(start, end, len1);
9282 if (end - start < len2)
9283 return -1;
9284
9285 buf1 = PyUnicode_DATA(s1);
9286 buf2 = PyUnicode_DATA(s2);
9287 if (len2 == 1) {
9288 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9289 result = findchar((const char *)buf1 + kind1*start,
9290 kind1, end - start, ch, direction);
9291 if (result == -1)
9292 return -1;
9293 else
9294 return start + result;
9295 }
9296
9297 if (kind2 != kind1) {
9298 buf2 = _PyUnicode_AsKind(s2, kind1);
9299 if (!buf2)
9300 return -2;
9301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302
Victor Stinner794d5672011-10-10 03:21:36 +02009303 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009304 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009305 case PyUnicode_1BYTE_KIND:
9306 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9307 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9308 else
9309 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9310 break;
9311 case PyUnicode_2BYTE_KIND:
9312 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9313 break;
9314 case PyUnicode_4BYTE_KIND:
9315 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9316 break;
9317 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009318 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009319 }
9320 }
9321 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009322 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009323 case PyUnicode_1BYTE_KIND:
9324 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9325 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9326 else
9327 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9328 break;
9329 case PyUnicode_2BYTE_KIND:
9330 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9331 break;
9332 case PyUnicode_4BYTE_KIND:
9333 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9334 break;
9335 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009336 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 }
9339
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009340 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 PyMem_Free(buf2);
9342
9343 return result;
9344}
9345
Victor Stinner59423e32018-11-26 13:40:01 +01009346/* _PyUnicode_InsertThousandsGrouping() helper functions */
9347#include "stringlib/localeutil.h"
9348
/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer, or NULL to select counting mode.
 * @n_buffer: Offset added to @writer->pos (filling mode), or used as is
 *         (counting mode), to obtain the initial buffer position handed
 *         to InsertThousandsGrouping_fill().
 * @digits: String the digits are read from. Must be non-NULL in filling
 *         mode and NULL in counting mode.
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *         to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *         Output will be zero-padded on the left to fill.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be non-NULL in counting mode, where it is initialised
 *         to 127 and then passed on to InsertThousandsGrouping_fill();
 *         must be NULL in filling mode.
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 * we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 * need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 * thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    /* The mode is selected by writer; digits and maxchar must agree. */
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(0 <= min_width);
    assert(grouping != NULL);

    /* The only error exits: a string that cannot be made ready. */
    if (digits != NULL) {
        if (PyUnicode_READY(digits) == -1) {
            return -1;
        }
    }
    if (PyUnicode_READY(thousands_sep) == -1) {
        return -1;
    }

    Py_ssize_t count = 0;
    Py_ssize_t n_zeros;
    int loop_broken = 0;
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;
    Py_ssize_t n_chars;
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start at the END of their ranges; presumably the
       fill helper works right-to-left -- confirm in
       stringlib/localeutil.h. */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    /* Counting mode: start from an ASCII-only maximum character. */
    if (!writer) {
        *maxchar = 127;
    }

    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        /* Never take more than what is still needed (digits or padding),
           but always at least one character. */
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */

        /* Account for this group (and the separator preceding it) in the
           total. */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer.
           NOTE(review): this is called in counting mode too (writer is
           NULL then); presumably the helper only updates *maxchar in that
           case -- confirm in stringlib/localeutil.h. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        /* Done once all digits are consumed and the minimum width is
           reached. */
        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* The separator emitted before the next group also counts
           towards the minimum width. */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement, i.e. the
           group generator ran out: put everything that is left into one
           final group. */

        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
9490
9491
Alexander Belopolsky40018472011-02-26 01:02:56 +00009492Py_ssize_t
9493PyUnicode_Count(PyObject *str,
9494 PyObject *substr,
9495 Py_ssize_t start,
9496 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009498 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009499 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 void *buf1 = NULL, *buf2 = NULL;
9501 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009502
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009503 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009505
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009506 kind1 = PyUnicode_KIND(str);
9507 kind2 = PyUnicode_KIND(substr);
9508 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009509 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009510
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009511 len1 = PyUnicode_GET_LENGTH(str);
9512 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009514 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009515 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009516
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009517 buf1 = PyUnicode_DATA(str);
9518 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009519 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009520 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009521 if (!buf2)
9522 goto onError;
9523 }
9524
9525 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009527 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009528 result = asciilib_count(
9529 ((Py_UCS1*)buf1) + start, end - start,
9530 buf2, len2, PY_SSIZE_T_MAX
9531 );
9532 else
9533 result = ucs1lib_count(
9534 ((Py_UCS1*)buf1) + start, end - start,
9535 buf2, len2, PY_SSIZE_T_MAX
9536 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 break;
9538 case PyUnicode_2BYTE_KIND:
9539 result = ucs2lib_count(
9540 ((Py_UCS2*)buf1) + start, end - start,
9541 buf2, len2, PY_SSIZE_T_MAX
9542 );
9543 break;
9544 case PyUnicode_4BYTE_KIND:
9545 result = ucs4lib_count(
9546 ((Py_UCS4*)buf1) + start, end - start,
9547 buf2, len2, PY_SSIZE_T_MAX
9548 );
9549 break;
9550 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009551 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009553
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009554 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 PyMem_Free(buf2);
9556
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009559 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 PyMem_Free(buf2);
9561 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562}
9563
Alexander Belopolsky40018472011-02-26 01:02:56 +00009564Py_ssize_t
9565PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009566 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009567 Py_ssize_t start,
9568 Py_ssize_t end,
9569 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009571 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009573
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009574 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575}
9576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577Py_ssize_t
9578PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9579 Py_ssize_t start, Py_ssize_t end,
9580 int direction)
9581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009583 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 if (PyUnicode_READY(str) == -1)
9585 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009586 len = PyUnicode_GET_LENGTH(str);
9587 ADJUST_INDICES(start, end, len);
9588 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009589 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009591 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9592 kind, end-start, ch, direction);
9593 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009595 else
9596 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597}
9598
Alexander Belopolsky40018472011-02-26 01:02:56 +00009599static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009600tailmatch(PyObject *self,
9601 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009602 Py_ssize_t start,
9603 Py_ssize_t end,
9604 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 int kind_self;
9607 int kind_sub;
9608 void *data_self;
9609 void *data_sub;
9610 Py_ssize_t offset;
9611 Py_ssize_t i;
9612 Py_ssize_t end_sub;
9613
9614 if (PyUnicode_READY(self) == -1 ||
9615 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009616 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9619 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009623 if (PyUnicode_GET_LENGTH(substring) == 0)
9624 return 1;
9625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 kind_self = PyUnicode_KIND(self);
9627 data_self = PyUnicode_DATA(self);
9628 kind_sub = PyUnicode_KIND(substring);
9629 data_sub = PyUnicode_DATA(substring);
9630 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9631
9632 if (direction > 0)
9633 offset = end;
9634 else
9635 offset = start;
9636
9637 if (PyUnicode_READ(kind_self, data_self, offset) ==
9638 PyUnicode_READ(kind_sub, data_sub, 0) &&
9639 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9640 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9641 /* If both are of the same kind, memcmp is sufficient */
9642 if (kind_self == kind_sub) {
9643 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009644 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 data_sub,
9646 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009647 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009649 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 else {
9651 /* We do not need to compare 0 and len(substring)-1 because
9652 the if statement above ensured already that they are equal
9653 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 for (i = 1; i < end_sub; ++i) {
9655 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9656 PyUnicode_READ(kind_sub, data_sub, i))
9657 return 0;
9658 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661 }
9662
9663 return 0;
9664}
9665
Alexander Belopolsky40018472011-02-26 01:02:56 +00009666Py_ssize_t
9667PyUnicode_Tailmatch(PyObject *str,
9668 PyObject *substr,
9669 Py_ssize_t start,
9670 Py_ssize_t end,
9671 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009673 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009675
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009676 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677}
9678
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679static PyObject *
9680ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009682 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9683 char *resdata, *data = PyUnicode_DATA(self);
9684 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009685
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 res = PyUnicode_New(len, 127);
9687 if (res == NULL)
9688 return NULL;
9689 resdata = PyUnicode_DATA(res);
9690 if (lower)
9691 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 _Py_bytes_upper(resdata, data, len);
9694 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695}
9696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 Py_ssize_t j;
9701 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009702 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009704
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9706
9707 where ! is a negation and \p{xxx} is a character with property xxx.
9708 */
9709 for (j = i - 1; j >= 0; j--) {
9710 c = PyUnicode_READ(kind, data, j);
9711 if (!_PyUnicode_IsCaseIgnorable(c))
9712 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9715 if (final_sigma) {
9716 for (j = i + 1; j < length; j++) {
9717 c = PyUnicode_READ(kind, data, j);
9718 if (!_PyUnicode_IsCaseIgnorable(c))
9719 break;
9720 }
9721 final_sigma = j == length || !_PyUnicode_IsCased(c);
9722 }
9723 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724}
9725
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726static int
9727lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9728 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009730 /* Obscure special case. */
9731 if (c == 0x3A3) {
9732 mapped[0] = handle_capital_sigma(kind, data, length, i);
9733 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009735 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736}
9737
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009738static Py_ssize_t
9739do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 Py_ssize_t i, k = 0;
9742 int n_res, j;
9743 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009744
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009745 c = PyUnicode_READ(kind, data, 0);
9746 n_res = _PyUnicode_ToUpperFull(c, mapped);
9747 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009748 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009751 for (i = 1; i < length; i++) {
9752 c = PyUnicode_READ(kind, data, i);
9753 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9754 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009755 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009757 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009758 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760}
9761
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762static Py_ssize_t
9763do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9764 Py_ssize_t i, k = 0;
9765
9766 for (i = 0; i < length; i++) {
9767 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9768 int n_res, j;
9769 if (Py_UNICODE_ISUPPER(c)) {
9770 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9771 }
9772 else if (Py_UNICODE_ISLOWER(c)) {
9773 n_res = _PyUnicode_ToUpperFull(c, mapped);
9774 }
9775 else {
9776 n_res = 1;
9777 mapped[0] = c;
9778 }
9779 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009780 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009781 res[k++] = mapped[j];
9782 }
9783 }
9784 return k;
9785}
9786
9787static Py_ssize_t
9788do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9789 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009791 Py_ssize_t i, k = 0;
9792
9793 for (i = 0; i < length; i++) {
9794 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9795 int n_res, j;
9796 if (lower)
9797 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9798 else
9799 n_res = _PyUnicode_ToUpperFull(c, mapped);
9800 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009801 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009802 res[k++] = mapped[j];
9803 }
9804 }
9805 return k;
9806}
9807
9808static Py_ssize_t
9809do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9810{
9811 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9812}
9813
9814static Py_ssize_t
9815do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9816{
9817 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9818}
9819
Benjamin Petersone51757f2012-01-12 21:10:29 -05009820static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009821do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9822{
9823 Py_ssize_t i, k = 0;
9824
9825 for (i = 0; i < length; i++) {
9826 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9827 Py_UCS4 mapped[3];
9828 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9829 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009830 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009831 res[k++] = mapped[j];
9832 }
9833 }
9834 return k;
9835}
9836
9837static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009838do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9839{
9840 Py_ssize_t i, k = 0;
9841 int previous_is_cased;
9842
9843 previous_is_cased = 0;
9844 for (i = 0; i < length; i++) {
9845 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9846 Py_UCS4 mapped[3];
9847 int n_res, j;
9848
9849 if (previous_is_cased)
9850 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9851 else
9852 n_res = _PyUnicode_ToTitleFull(c, mapped);
9853
9854 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009855 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009856 res[k++] = mapped[j];
9857 }
9858
9859 previous_is_cased = _PyUnicode_IsCased(c);
9860 }
9861 return k;
9862}
9863
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009864static PyObject *
9865case_operation(PyObject *self,
9866 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9867{
9868 PyObject *res = NULL;
9869 Py_ssize_t length, newlength = 0;
9870 int kind, outkind;
9871 void *data, *outdata;
9872 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9873
Benjamin Petersoneea48462012-01-16 14:28:50 -05009874 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875
9876 kind = PyUnicode_KIND(self);
9877 data = PyUnicode_DATA(self);
9878 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009879 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009880 PyErr_SetString(PyExc_OverflowError, "string is too long");
9881 return NULL;
9882 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009883 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009884 if (tmp == NULL)
9885 return PyErr_NoMemory();
9886 newlength = perform(kind, data, length, tmp, &maxchar);
9887 res = PyUnicode_New(newlength, maxchar);
9888 if (res == NULL)
9889 goto leave;
9890 tmpend = tmp + newlength;
9891 outdata = PyUnicode_DATA(res);
9892 outkind = PyUnicode_KIND(res);
9893 switch (outkind) {
9894 case PyUnicode_1BYTE_KIND:
9895 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9896 break;
9897 case PyUnicode_2BYTE_KIND:
9898 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9899 break;
9900 case PyUnicode_4BYTE_KIND:
9901 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9902 break;
9903 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009904 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009905 }
9906 leave:
9907 PyMem_FREE(tmp);
9908 return res;
9909}
9910
Tim Peters8ce9f162004-08-27 01:49:32 +00009911PyObject *
9912PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009914 PyObject *res;
9915 PyObject *fseq;
9916 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009917 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009919 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009920 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009921 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009922 }
9923
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009924 /* NOTE: the following code can't call back into Python code,
9925 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009926 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009927
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009928 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009929 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009930 res = _PyUnicode_JoinArray(separator, items, seqlen);
9931 Py_DECREF(fseq);
9932 return res;
9933}
9934
9935PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009936_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009937{
9938 PyObject *res = NULL; /* the result */
9939 PyObject *sep = NULL;
9940 Py_ssize_t seplen;
9941 PyObject *item;
9942 Py_ssize_t sz, i, res_offset;
9943 Py_UCS4 maxchar;
9944 Py_UCS4 item_maxchar;
9945 int use_memcpy;
9946 unsigned char *res_data = NULL, *sep_data = NULL;
9947 PyObject *last_obj;
9948 unsigned int kind = 0;
9949
Tim Peters05eba1f2004-08-27 21:32:02 +00009950 /* If empty sequence, return u"". */
9951 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009952 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009953 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009954
Tim Peters05eba1f2004-08-27 21:32:02 +00009955 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009956 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009957 if (seqlen == 1) {
9958 if (PyUnicode_CheckExact(items[0])) {
9959 res = items[0];
9960 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009961 return res;
9962 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009963 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009964 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009965 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009967 /* Set up sep and seplen */
9968 if (separator == NULL) {
9969 /* fall back to a blank space separator */
9970 sep = PyUnicode_FromOrdinal(' ');
9971 if (!sep)
9972 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009973 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009974 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009975 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009976 else {
9977 if (!PyUnicode_Check(separator)) {
9978 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009979 "separator: expected str instance,"
9980 " %.80s found",
9981 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009982 goto onError;
9983 }
9984 if (PyUnicode_READY(separator))
9985 goto onError;
9986 sep = separator;
9987 seplen = PyUnicode_GET_LENGTH(separator);
9988 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9989 /* inc refcount to keep this code path symmetric with the
9990 above case of a blank separator */
9991 Py_INCREF(sep);
9992 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009993 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009994 }
9995
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009996 /* There are at least two things to join, or else we have a subclass
9997 * of str in the sequence.
9998 * Do a pre-pass to figure out the total amount of space we'll
9999 * need (sz), and see whether all argument are strings.
10000 */
10001 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010002#ifdef Py_DEBUG
10003 use_memcpy = 0;
10004#else
10005 use_memcpy = 1;
10006#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010007 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010008 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010010 if (!PyUnicode_Check(item)) {
10011 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010012 "sequence item %zd: expected str instance,"
10013 " %.80s found",
10014 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010015 goto onError;
10016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 if (PyUnicode_READY(item) == -1)
10018 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010019 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010021 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010022 if (i != 0) {
10023 add_sz += seplen;
10024 }
10025 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010026 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010027 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010028 goto onError;
10029 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010030 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 if (use_memcpy && last_obj != NULL) {
10032 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10033 use_memcpy = 0;
10034 }
10035 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010036 }
Tim Petersced69f82003-09-16 20:30:58 +000010037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010039 if (res == NULL)
10040 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010041
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010042 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010043#ifdef Py_DEBUG
10044 use_memcpy = 0;
10045#else
10046 if (use_memcpy) {
10047 res_data = PyUnicode_1BYTE_DATA(res);
10048 kind = PyUnicode_KIND(res);
10049 if (seplen != 0)
10050 sep_data = PyUnicode_1BYTE_DATA(sep);
10051 }
10052#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010053 if (use_memcpy) {
10054 for (i = 0; i < seqlen; ++i) {
10055 Py_ssize_t itemlen;
10056 item = items[i];
10057
10058 /* Copy item, and maybe the separator. */
10059 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010060 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010061 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010062 kind * seplen);
10063 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010064 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010065
10066 itemlen = PyUnicode_GET_LENGTH(item);
10067 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010068 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010069 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010070 kind * itemlen);
10071 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010072 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010073 }
10074 assert(res_data == PyUnicode_1BYTE_DATA(res)
10075 + kind * PyUnicode_GET_LENGTH(res));
10076 }
10077 else {
10078 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10079 Py_ssize_t itemlen;
10080 item = items[i];
10081
10082 /* Copy item, and maybe the separator. */
10083 if (i && seplen != 0) {
10084 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10085 res_offset += seplen;
10086 }
10087
10088 itemlen = PyUnicode_GET_LENGTH(item);
10089 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010090 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010091 res_offset += itemlen;
10092 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010093 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010094 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010095 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010098 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100
Benjamin Peterson29060642009-01-31 22:14:21 +000010101 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010103 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104 return NULL;
10105}
10106
Victor Stinnerd3f08822012-05-29 12:57:52 +020010107void
10108_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10109 Py_UCS4 fill_char)
10110{
10111 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010112 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010113 assert(PyUnicode_IS_READY(unicode));
10114 assert(unicode_modifiable(unicode));
10115 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10116 assert(start >= 0);
10117 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010118 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010119}
10120
Victor Stinner3fe55312012-01-04 00:33:50 +010010121Py_ssize_t
10122PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10123 Py_UCS4 fill_char)
10124{
10125 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010126
10127 if (!PyUnicode_Check(unicode)) {
10128 PyErr_BadInternalCall();
10129 return -1;
10130 }
10131 if (PyUnicode_READY(unicode) == -1)
10132 return -1;
10133 if (unicode_check_modifiable(unicode))
10134 return -1;
10135
Victor Stinnerd3f08822012-05-29 12:57:52 +020010136 if (start < 0) {
10137 PyErr_SetString(PyExc_IndexError, "string index out of range");
10138 return -1;
10139 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010140 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10141 PyErr_SetString(PyExc_ValueError,
10142 "fill character is bigger than "
10143 "the string maximum character");
10144 return -1;
10145 }
10146
10147 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10148 length = Py_MIN(maxlen, length);
10149 if (length <= 0)
10150 return 0;
10151
Victor Stinnerd3f08822012-05-29 12:57:52 +020010152 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010153 return length;
10154}
10155
Victor Stinner9310abb2011-10-05 00:59:23 +020010156static PyObject *
10157pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010158 Py_ssize_t left,
10159 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 PyObject *u;
10163 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010164 int kind;
10165 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166
10167 if (left < 0)
10168 left = 0;
10169 if (right < 0)
10170 right = 0;
10171
Victor Stinnerc4b49542011-12-11 22:44:26 +010010172 if (left == 0 && right == 0)
10173 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10176 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010177 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10178 return NULL;
10179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010181 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010183 if (!u)
10184 return NULL;
10185
10186 kind = PyUnicode_KIND(u);
10187 data = PyUnicode_DATA(u);
10188 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010189 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010190 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010191 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010192 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010193 assert(_PyUnicode_CheckConsistency(u, 1));
10194 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195}
10196
Alexander Belopolsky40018472011-02-26 01:02:56 +000010197PyObject *
10198PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010202 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Benjamin Petersonead6b532011-12-20 17:23:42 -060010205 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010207 if (PyUnicode_IS_ASCII(string))
10208 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010209 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 PyUnicode_GET_LENGTH(string), keepends);
10211 else
10212 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010213 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 break;
10216 case PyUnicode_2BYTE_KIND:
10217 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010218 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 PyUnicode_GET_LENGTH(string), keepends);
10220 break;
10221 case PyUnicode_4BYTE_KIND:
10222 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010223 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 PyUnicode_GET_LENGTH(string), keepends);
10225 break;
10226 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010227 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230}
10231
Alexander Belopolsky40018472011-02-26 01:02:56 +000010232static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010233split(PyObject *self,
10234 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010235 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010237 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 void *buf1, *buf2;
10239 Py_ssize_t len1, len2;
10240 PyObject* out;
10241
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010243 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 if (PyUnicode_READY(self) == -1)
10246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010249 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010251 if (PyUnicode_IS_ASCII(self))
10252 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010253 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 PyUnicode_GET_LENGTH(self), maxcount
10255 );
10256 else
10257 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010258 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010259 PyUnicode_GET_LENGTH(self), maxcount
10260 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 case PyUnicode_2BYTE_KIND:
10262 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010263 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 PyUnicode_GET_LENGTH(self), maxcount
10265 );
10266 case PyUnicode_4BYTE_KIND:
10267 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010268 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 PyUnicode_GET_LENGTH(self), maxcount
10270 );
10271 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010272 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 }
10274
10275 if (PyUnicode_READY(substring) == -1)
10276 return NULL;
10277
10278 kind1 = PyUnicode_KIND(self);
10279 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 len1 = PyUnicode_GET_LENGTH(self);
10281 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010282 if (kind1 < kind2 || len1 < len2) {
10283 out = PyList_New(1);
10284 if (out == NULL)
10285 return NULL;
10286 Py_INCREF(self);
10287 PyList_SET_ITEM(out, 0, self);
10288 return out;
10289 }
10290 buf1 = PyUnicode_DATA(self);
10291 buf2 = PyUnicode_DATA(substring);
10292 if (kind2 != kind1) {
10293 buf2 = _PyUnicode_AsKind(substring, kind1);
10294 if (!buf2)
10295 return NULL;
10296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010298 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010300 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10301 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010302 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010303 else
10304 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010305 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 break;
10307 case PyUnicode_2BYTE_KIND:
10308 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 break;
10311 case PyUnicode_4BYTE_KIND:
10312 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 break;
10315 default:
10316 out = NULL;
10317 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010318 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 PyMem_Free(buf2);
10320 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321}
10322
Alexander Belopolsky40018472011-02-26 01:02:56 +000010323static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010324rsplit(PyObject *self,
10325 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010326 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010327{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010328 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 void *buf1, *buf2;
10330 Py_ssize_t len1, len2;
10331 PyObject* out;
10332
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010333 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010334 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 if (PyUnicode_READY(self) == -1)
10337 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010340 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010342 if (PyUnicode_IS_ASCII(self))
10343 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010344 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010345 PyUnicode_GET_LENGTH(self), maxcount
10346 );
10347 else
10348 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010349 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350 PyUnicode_GET_LENGTH(self), maxcount
10351 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 case PyUnicode_2BYTE_KIND:
10353 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010354 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 PyUnicode_GET_LENGTH(self), maxcount
10356 );
10357 case PyUnicode_4BYTE_KIND:
10358 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010359 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 PyUnicode_GET_LENGTH(self), maxcount
10361 );
10362 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010363 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 }
10365
10366 if (PyUnicode_READY(substring) == -1)
10367 return NULL;
10368
10369 kind1 = PyUnicode_KIND(self);
10370 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 len1 = PyUnicode_GET_LENGTH(self);
10372 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010373 if (kind1 < kind2 || len1 < len2) {
10374 out = PyList_New(1);
10375 if (out == NULL)
10376 return NULL;
10377 Py_INCREF(self);
10378 PyList_SET_ITEM(out, 0, self);
10379 return out;
10380 }
10381 buf1 = PyUnicode_DATA(self);
10382 buf2 = PyUnicode_DATA(substring);
10383 if (kind2 != kind1) {
10384 buf2 = _PyUnicode_AsKind(substring, kind1);
10385 if (!buf2)
10386 return NULL;
10387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010389 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10392 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010393 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010394 else
10395 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010396 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 break;
10398 case PyUnicode_2BYTE_KIND:
10399 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010400 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 break;
10402 case PyUnicode_4BYTE_KIND:
10403 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 break;
10406 default:
10407 out = NULL;
10408 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010409 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 PyMem_Free(buf2);
10411 return out;
10412}
10413
10414static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010415anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10416 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010418 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010420 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10421 return asciilib_find(buf1, len1, buf2, len2, offset);
10422 else
10423 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 case PyUnicode_2BYTE_KIND:
10425 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10426 case PyUnicode_4BYTE_KIND:
10427 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10428 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010429 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430}
10431
10432static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010433anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10434 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010436 switch (kind) {
10437 case PyUnicode_1BYTE_KIND:
10438 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10439 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10440 else
10441 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10442 case PyUnicode_2BYTE_KIND:
10443 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10444 case PyUnicode_4BYTE_KIND:
10445 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10446 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010447 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010448}
10449
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010450static void
10451replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10452 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10453{
10454 int kind = PyUnicode_KIND(u);
10455 void *data = PyUnicode_DATA(u);
10456 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10457 if (kind == PyUnicode_1BYTE_KIND) {
10458 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10459 (Py_UCS1 *)data + len,
10460 u1, u2, maxcount);
10461 }
10462 else if (kind == PyUnicode_2BYTE_KIND) {
10463 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10464 (Py_UCS2 *)data + len,
10465 u1, u2, maxcount);
10466 }
10467 else {
10468 assert(kind == PyUnicode_4BYTE_KIND);
10469 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10470 (Py_UCS4 *)data + len,
10471 u1, u2, maxcount);
10472 }
10473}
10474
Alexander Belopolsky40018472011-02-26 01:02:56 +000010475static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476replace(PyObject *self, PyObject *str1,
10477 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 PyObject *u;
10480 char *sbuf = PyUnicode_DATA(self);
10481 char *buf1 = PyUnicode_DATA(str1);
10482 char *buf2 = PyUnicode_DATA(str2);
10483 int srelease = 0, release1 = 0, release2 = 0;
10484 int skind = PyUnicode_KIND(self);
10485 int kind1 = PyUnicode_KIND(str1);
10486 int kind2 = PyUnicode_KIND(str2);
10487 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10488 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10489 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010490 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010491 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
10493 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010494 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010496 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497
Victor Stinner59de0ee2011-10-07 10:01:28 +020010498 if (str1 == str2)
10499 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500
Victor Stinner49a0a212011-10-12 23:46:10 +020010501 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010502 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10503 if (maxchar < maxchar_str1)
10504 /* substring too wide to be present */
10505 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010506 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10507 /* Replacing str1 with str2 may cause a maxchar reduction in the
10508 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010509 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010510 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010513 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010515 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010517 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010518 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010519 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010520
Victor Stinner69ed0f42013-04-09 21:48:24 +020010521 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010522 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010523 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010524 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010525 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010527 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010529
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010530 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10531 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010532 }
10533 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 int rkind = skind;
10535 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010536 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 if (kind1 < rkind) {
10539 /* widen substring */
10540 buf1 = _PyUnicode_AsKind(str1, rkind);
10541 if (!buf1) goto error;
10542 release1 = 1;
10543 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010544 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010545 if (i < 0)
10546 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (rkind > kind2) {
10548 /* widen replacement */
10549 buf2 = _PyUnicode_AsKind(str2, rkind);
10550 if (!buf2) goto error;
10551 release2 = 1;
10552 }
10553 else if (rkind < kind2) {
10554 /* widen self and buf1 */
10555 rkind = kind2;
10556 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010557 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 sbuf = _PyUnicode_AsKind(self, rkind);
10559 if (!sbuf) goto error;
10560 srelease = 1;
10561 buf1 = _PyUnicode_AsKind(str1, rkind);
10562 if (!buf1) goto error;
10563 release1 = 1;
10564 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010565 u = PyUnicode_New(slen, maxchar);
10566 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010568 assert(PyUnicode_KIND(u) == rkind);
10569 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010570
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010571 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010572 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010573 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010575 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010577
10578 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010579 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010580 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010581 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010582 if (i == -1)
10583 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010584 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010586 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010590 }
10591 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010593 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 int rkind = skind;
10595 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010598 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 buf1 = _PyUnicode_AsKind(str1, rkind);
10600 if (!buf1) goto error;
10601 release1 = 1;
10602 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010603 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010604 if (n == 0)
10605 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010607 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 buf2 = _PyUnicode_AsKind(str2, rkind);
10609 if (!buf2) goto error;
10610 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 rkind = kind2;
10615 sbuf = _PyUnicode_AsKind(self, rkind);
10616 if (!sbuf) goto error;
10617 srelease = 1;
10618 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010619 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 buf1 = _PyUnicode_AsKind(str1, rkind);
10621 if (!buf1) goto error;
10622 release1 = 1;
10623 }
10624 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10625 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010626 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 PyErr_SetString(PyExc_OverflowError,
10628 "replace string is too long");
10629 goto error;
10630 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010631 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010632 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010633 _Py_INCREF_UNICODE_EMPTY();
10634 if (!unicode_empty)
10635 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010636 u = unicode_empty;
10637 goto done;
10638 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010639 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 PyErr_SetString(PyExc_OverflowError,
10641 "replace string is too long");
10642 goto error;
10643 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010644 u = PyUnicode_New(new_size, maxchar);
10645 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010647 assert(PyUnicode_KIND(u) == rkind);
10648 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 ires = i = 0;
10650 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651 while (n-- > 0) {
10652 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010653 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010654 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010655 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010656 if (j == -1)
10657 break;
10658 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010660 memcpy(res + rkind * ires,
10661 sbuf + rkind * i,
10662 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 }
10665 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010667 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010669 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010676 memcpy(res + rkind * ires,
10677 sbuf + rkind * i,
10678 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010679 }
10680 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 /* interleave */
10682 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010683 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010685 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687 if (--n <= 0)
10688 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010689 memcpy(res + rkind * ires,
10690 sbuf + rkind * i,
10691 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 ires++;
10693 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010695 memcpy(res + rkind * ires,
10696 sbuf + rkind * i,
10697 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010698 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010699 }
10700
10701 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010702 unicode_adjust_maxchar(&u);
10703 if (u == NULL)
10704 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010706
10707 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 if (srelease)
10709 PyMem_FREE(sbuf);
10710 if (release1)
10711 PyMem_FREE(buf1);
10712 if (release2)
10713 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010714 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010716
Benjamin Peterson29060642009-01-31 22:14:21 +000010717 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010718 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (srelease)
10720 PyMem_FREE(sbuf);
10721 if (release1)
10722 PyMem_FREE(buf1);
10723 if (release2)
10724 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010725 return unicode_result_unchanged(self);
10726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 error:
10728 if (srelease && sbuf)
10729 PyMem_FREE(sbuf);
10730 if (release1 && buf1)
10731 PyMem_FREE(buf1);
10732 if (release2 && buf2)
10733 PyMem_FREE(buf2);
10734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735}
10736
10737/* --- Unicode Object Methods --------------------------------------------- */
10738
INADA Naoki3ae20562017-01-16 20:41:20 +090010739/*[clinic input]
10740str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741
INADA Naoki3ae20562017-01-16 20:41:20 +090010742Return a version of the string where each word is titlecased.
10743
10744More specifically, words start with uppercased characters and all remaining
10745cased characters have lower case.
10746[clinic start generated code]*/
10747
10748static PyObject *
10749unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010750/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010752 if (PyUnicode_READY(self) == -1)
10753 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010754 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755}
10756
INADA Naoki3ae20562017-01-16 20:41:20 +090010757/*[clinic input]
10758str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759
INADA Naoki3ae20562017-01-16 20:41:20 +090010760Return a capitalized version of the string.
10761
10762More specifically, make the first character have upper case and the rest lower
10763case.
10764[clinic start generated code]*/
10765
10766static PyObject *
10767unicode_capitalize_impl(PyObject *self)
10768/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010770 if (PyUnicode_READY(self) == -1)
10771 return NULL;
10772 if (PyUnicode_GET_LENGTH(self) == 0)
10773 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010774 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775}
10776
INADA Naoki3ae20562017-01-16 20:41:20 +090010777/*[clinic input]
10778str.casefold as unicode_casefold
10779
10780Return a version of the string suitable for caseless comparisons.
10781[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010782
10783static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010784unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010785/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010786{
10787 if (PyUnicode_READY(self) == -1)
10788 return NULL;
10789 if (PyUnicode_IS_ASCII(self))
10790 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010791 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010792}
10793
10794
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010795/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010796
10797static int
10798convert_uc(PyObject *obj, void *addr)
10799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010801
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010802 if (!PyUnicode_Check(obj)) {
10803 PyErr_Format(PyExc_TypeError,
10804 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010805 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010806 return 0;
10807 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010808 if (PyUnicode_READY(obj) < 0)
10809 return 0;
10810 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010811 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010813 return 0;
10814 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010815 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010816 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010817}
10818
INADA Naoki3ae20562017-01-16 20:41:20 +090010819/*[clinic input]
10820str.center as unicode_center
10821
10822 width: Py_ssize_t
10823 fillchar: Py_UCS4 = ' '
10824 /
10825
10826Return a centered string of length width.
10827
10828Padding is done using the specified fill character (default is a space).
10829[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
10831static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010832unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10833/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010835 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
Benjamin Petersonbac79492012-01-14 13:34:47 -050010837 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838 return NULL;
10839
Victor Stinnerc4b49542011-12-11 22:44:26 +010010840 if (PyUnicode_GET_LENGTH(self) >= width)
10841 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
Victor Stinnerc4b49542011-12-11 22:44:26 +010010843 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 left = marg / 2 + (marg & width & 1);
10845
Victor Stinner9310abb2011-10-05 00:59:23 +020010846 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847}
10848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849/* This function assumes that str1 and str2 are readied by the caller. */
10850
Marc-André Lemburge5034372000-08-08 08:04:29 +000010851static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010852unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010853{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010854#define COMPARE(TYPE1, TYPE2) \
10855 do { \
10856 TYPE1* p1 = (TYPE1 *)data1; \
10857 TYPE2* p2 = (TYPE2 *)data2; \
10858 TYPE1* end = p1 + len; \
10859 Py_UCS4 c1, c2; \
10860 for (; p1 != end; p1++, p2++) { \
10861 c1 = *p1; \
10862 c2 = *p2; \
10863 if (c1 != c2) \
10864 return (c1 < c2) ? -1 : 1; \
10865 } \
10866 } \
10867 while (0)
10868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 int kind1, kind2;
10870 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010871 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 kind1 = PyUnicode_KIND(str1);
10874 kind2 = PyUnicode_KIND(str2);
10875 data1 = PyUnicode_DATA(str1);
10876 data2 = PyUnicode_DATA(str2);
10877 len1 = PyUnicode_GET_LENGTH(str1);
10878 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010879 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010880
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010881 switch(kind1) {
10882 case PyUnicode_1BYTE_KIND:
10883 {
10884 switch(kind2) {
10885 case PyUnicode_1BYTE_KIND:
10886 {
10887 int cmp = memcmp(data1, data2, len);
10888 /* normalize result of memcmp() into the range [-1; 1] */
10889 if (cmp < 0)
10890 return -1;
10891 if (cmp > 0)
10892 return 1;
10893 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010894 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010895 case PyUnicode_2BYTE_KIND:
10896 COMPARE(Py_UCS1, Py_UCS2);
10897 break;
10898 case PyUnicode_4BYTE_KIND:
10899 COMPARE(Py_UCS1, Py_UCS4);
10900 break;
10901 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010902 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010903 }
10904 break;
10905 }
10906 case PyUnicode_2BYTE_KIND:
10907 {
10908 switch(kind2) {
10909 case PyUnicode_1BYTE_KIND:
10910 COMPARE(Py_UCS2, Py_UCS1);
10911 break;
10912 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010913 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010914 COMPARE(Py_UCS2, Py_UCS2);
10915 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010916 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010917 case PyUnicode_4BYTE_KIND:
10918 COMPARE(Py_UCS2, Py_UCS4);
10919 break;
10920 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010921 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010922 }
10923 break;
10924 }
10925 case PyUnicode_4BYTE_KIND:
10926 {
10927 switch(kind2) {
10928 case PyUnicode_1BYTE_KIND:
10929 COMPARE(Py_UCS4, Py_UCS1);
10930 break;
10931 case PyUnicode_2BYTE_KIND:
10932 COMPARE(Py_UCS4, Py_UCS2);
10933 break;
10934 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010935 {
10936#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10937 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10938 /* normalize result of wmemcmp() into the range [-1; 1] */
10939 if (cmp < 0)
10940 return -1;
10941 if (cmp > 0)
10942 return 1;
10943#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010944 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010945#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010946 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010947 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010948 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010949 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010950 }
10951 break;
10952 }
10953 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010954 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010955 }
10956
Victor Stinner770e19e2012-10-04 22:59:45 +020010957 if (len1 == len2)
10958 return 0;
10959 if (len1 < len2)
10960 return -1;
10961 else
10962 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010963
10964#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010965}
10966
Benjamin Peterson621b4302016-09-09 13:54:34 -070010967static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010968unicode_compare_eq(PyObject *str1, PyObject *str2)
10969{
10970 int kind;
10971 void *data1, *data2;
10972 Py_ssize_t len;
10973 int cmp;
10974
Victor Stinnere5567ad2012-10-23 02:48:49 +020010975 len = PyUnicode_GET_LENGTH(str1);
10976 if (PyUnicode_GET_LENGTH(str2) != len)
10977 return 0;
10978 kind = PyUnicode_KIND(str1);
10979 if (PyUnicode_KIND(str2) != kind)
10980 return 0;
10981 data1 = PyUnicode_DATA(str1);
10982 data2 = PyUnicode_DATA(str2);
10983
10984 cmp = memcmp(data1, data2, len * kind);
10985 return (cmp == 0);
10986}
10987
10988
Alexander Belopolsky40018472011-02-26 01:02:56 +000010989int
10990PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10993 if (PyUnicode_READY(left) == -1 ||
10994 PyUnicode_READY(right) == -1)
10995 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010996
10997 /* a string is equal to itself */
10998 if (left == right)
10999 return 0;
11000
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011001 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011003 PyErr_Format(PyExc_TypeError,
11004 "Can't compare %.100s and %.100s",
11005 left->ob_type->tp_name,
11006 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 return -1;
11008}
11009
Martin v. Löwis5b222132007-06-10 09:51:05 +000011010int
11011PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 Py_ssize_t i;
11014 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011016 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017
Victor Stinner910337b2011-10-03 03:20:16 +020011018 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011019 if (!PyUnicode_IS_READY(uni)) {
11020 const wchar_t *ws = _PyUnicode_WSTR(uni);
11021 /* Compare Unicode string and source character set string */
11022 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11023 if (chr != ustr[i])
11024 return (chr < ustr[i]) ? -1 : 1;
11025 }
11026 /* This check keeps Python strings that end in '\0' from comparing equal
11027 to C strings identical up to that point. */
11028 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11029 return 1; /* uni is longer */
11030 if (ustr[i])
11031 return -1; /* str is longer */
11032 return 0;
11033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011035 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011036 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011037 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011038 size_t len, len2 = strlen(str);
11039 int cmp;
11040
11041 len = Py_MIN(len1, len2);
11042 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011043 if (cmp != 0) {
11044 if (cmp < 0)
11045 return -1;
11046 else
11047 return 1;
11048 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011049 if (len1 > len2)
11050 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011051 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011052 return -1; /* str is longer */
11053 return 0;
11054 }
11055 else {
11056 void *data = PyUnicode_DATA(uni);
11057 /* Compare Unicode string and source character set string */
11058 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011059 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011060 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11061 /* This check keeps Python strings that end in '\0' from comparing equal
11062 to C strings identical up to that point. */
11063 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11064 return 1; /* uni is longer */
11065 if (str[i])
11066 return -1; /* str is longer */
11067 return 0;
11068 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011069}
11070
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011071static int
11072non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11073{
11074 size_t i, len;
11075 const wchar_t *p;
11076 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11077 if (strlen(str) != len)
11078 return 0;
11079 p = _PyUnicode_WSTR(unicode);
11080 assert(p);
11081 for (i = 0; i < len; i++) {
11082 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011083 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011084 return 0;
11085 }
11086 return 1;
11087}
11088
11089int
11090_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11091{
11092 size_t len;
11093 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011094 assert(str);
11095#ifndef NDEBUG
11096 for (const char *p = str; *p; p++) {
11097 assert((unsigned char)*p < 128);
11098 }
11099#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011100 if (PyUnicode_READY(unicode) == -1) {
11101 /* Memory error or bad data */
11102 PyErr_Clear();
11103 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11104 }
11105 if (!PyUnicode_IS_ASCII(unicode))
11106 return 0;
11107 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11108 return strlen(str) == len &&
11109 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11110}
11111
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011112int
11113_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11114{
11115 PyObject *right_uni;
11116 Py_hash_t hash;
11117
11118 assert(_PyUnicode_CHECK(left));
11119 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011120#ifndef NDEBUG
11121 for (const char *p = right->string; *p; p++) {
11122 assert((unsigned char)*p < 128);
11123 }
11124#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011125
11126 if (PyUnicode_READY(left) == -1) {
11127 /* memory error or bad data */
11128 PyErr_Clear();
11129 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11130 }
11131
11132 if (!PyUnicode_IS_ASCII(left))
11133 return 0;
11134
11135 right_uni = _PyUnicode_FromId(right); /* borrowed */
11136 if (right_uni == NULL) {
11137 /* memory error or bad data */
11138 PyErr_Clear();
11139 return _PyUnicode_EqualToASCIIString(left, right->string);
11140 }
11141
11142 if (left == right_uni)
11143 return 1;
11144
11145 if (PyUnicode_CHECK_INTERNED(left))
11146 return 0;
11147
INADA Naoki7cc95f52018-01-28 02:07:09 +090011148 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011149 hash = _PyUnicode_HASH(left);
11150 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11151 return 0;
11152
11153 return unicode_compare_eq(left, right_uni);
11154}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011155
Alexander Belopolsky40018472011-02-26 01:02:56 +000011156PyObject *
11157PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011158{
11159 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011160
Victor Stinnere5567ad2012-10-23 02:48:49 +020011161 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11162 Py_RETURN_NOTIMPLEMENTED;
11163
11164 if (PyUnicode_READY(left) == -1 ||
11165 PyUnicode_READY(right) == -1)
11166 return NULL;
11167
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011168 if (left == right) {
11169 switch (op) {
11170 case Py_EQ:
11171 case Py_LE:
11172 case Py_GE:
11173 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011174 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011175 case Py_NE:
11176 case Py_LT:
11177 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011178 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011179 default:
11180 PyErr_BadArgument();
11181 return NULL;
11182 }
11183 }
11184 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011185 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011186 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011187 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011188 }
11189 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011190 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011191 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011192 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011193}
11194
Alexander Belopolsky40018472011-02-26 01:02:56 +000011195int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011196_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11197{
11198 return unicode_eq(aa, bb);
11199}
11200
11201int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011202PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011203{
Victor Stinner77282cb2013-04-14 19:22:47 +020011204 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 void *buf1, *buf2;
11206 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011207 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011208
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011209 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011210 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011211 "'in <string>' requires string as left operand, not %.100s",
11212 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011213 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011214 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011215 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011216 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011217 if (ensure_unicode(str) < 0)
11218 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 kind2 = PyUnicode_KIND(substr);
11222 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011223 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 len2 = PyUnicode_GET_LENGTH(substr);
11226 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011227 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011228 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011230 if (len2 == 1) {
11231 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11232 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011233 return result;
11234 }
11235 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 buf2 = _PyUnicode_AsKind(substr, kind1);
11237 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011238 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011239 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240
Victor Stinner77282cb2013-04-14 19:22:47 +020011241 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 case PyUnicode_1BYTE_KIND:
11243 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11244 break;
11245 case PyUnicode_2BYTE_KIND:
11246 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11247 break;
11248 case PyUnicode_4BYTE_KIND:
11249 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11250 break;
11251 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011252 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011254
Victor Stinner77282cb2013-04-14 19:22:47 +020011255 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 PyMem_Free(buf2);
11257
Guido van Rossum403d68b2000-03-13 15:55:09 +000011258 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011259}
11260
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261/* Concat to string or Unicode object giving a new Unicode object. */
11262
Alexander Belopolsky40018472011-02-26 01:02:56 +000011263PyObject *
11264PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011266 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011267 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011268 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011270 if (ensure_unicode(left) < 0)
11271 return NULL;
11272
11273 if (!PyUnicode_Check(right)) {
11274 PyErr_Format(PyExc_TypeError,
11275 "can only concatenate str (not \"%.200s\") to str",
11276 right->ob_type->tp_name);
11277 return NULL;
11278 }
11279 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
11282 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011283 if (left == unicode_empty)
11284 return PyUnicode_FromObject(right);
11285 if (right == unicode_empty)
11286 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011288 left_len = PyUnicode_GET_LENGTH(left);
11289 right_len = PyUnicode_GET_LENGTH(right);
11290 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011291 PyErr_SetString(PyExc_OverflowError,
11292 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011293 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011294 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011296
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11298 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011299 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 result = PyUnicode_New(new_len, maxchar);
11303 if (result == NULL)
11304 return NULL;
11305 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11306 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11307 assert(_PyUnicode_CheckConsistency(result, 1));
11308 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309}
11310
Walter Dörwald1ab83302007-05-18 17:15:44 +000011311void
Victor Stinner23e56682011-10-03 03:54:37 +020011312PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011313{
Victor Stinner23e56682011-10-03 03:54:37 +020011314 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011315 Py_UCS4 maxchar, maxchar2;
11316 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011317
11318 if (p_left == NULL) {
11319 if (!PyErr_Occurred())
11320 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011321 return;
11322 }
Victor Stinner23e56682011-10-03 03:54:37 +020011323 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011324 if (right == NULL || left == NULL
11325 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011326 if (!PyErr_Occurred())
11327 PyErr_BadInternalCall();
11328 goto error;
11329 }
11330
Benjamin Petersonbac79492012-01-14 13:34:47 -050011331 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011332 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011333 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011334 goto error;
11335
Victor Stinner488fa492011-12-12 00:01:39 +010011336 /* Shortcuts */
11337 if (left == unicode_empty) {
11338 Py_DECREF(left);
11339 Py_INCREF(right);
11340 *p_left = right;
11341 return;
11342 }
11343 if (right == unicode_empty)
11344 return;
11345
11346 left_len = PyUnicode_GET_LENGTH(left);
11347 right_len = PyUnicode_GET_LENGTH(right);
11348 if (left_len > PY_SSIZE_T_MAX - right_len) {
11349 PyErr_SetString(PyExc_OverflowError,
11350 "strings are too large to concat");
11351 goto error;
11352 }
11353 new_len = left_len + right_len;
11354
11355 if (unicode_modifiable(left)
11356 && PyUnicode_CheckExact(right)
11357 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011358 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11359 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011360 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011361 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011362 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11363 {
11364 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011365 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011366 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011367
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011368 /* copy 'right' into the newly allocated area of 'left' */
11369 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011370 }
Victor Stinner488fa492011-12-12 00:01:39 +010011371 else {
11372 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11373 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011374 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011375
Victor Stinner488fa492011-12-12 00:01:39 +010011376 /* Concat the two Unicode strings */
11377 res = PyUnicode_New(new_len, maxchar);
11378 if (res == NULL)
11379 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011380 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11381 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011382 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011383 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011384 }
11385 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011386 return;
11387
11388error:
Victor Stinner488fa492011-12-12 00:01:39 +010011389 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011390}
11391
11392void
11393PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11394{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011395 PyUnicode_Append(pleft, right);
11396 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011397}
11398
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011399/*
11400Wraps stringlib_parse_args_finds() and additionally ensures that the
11401first argument is a unicode object.
11402*/
11403
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011404static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011405parse_args_finds_unicode(const char * function_name, PyObject *args,
11406 PyObject **substring,
11407 Py_ssize_t *start, Py_ssize_t *end)
11408{
11409 if(stringlib_parse_args_finds(function_name, args, substring,
11410 start, end)) {
11411 if (ensure_unicode(*substring) < 0)
11412 return 0;
11413 return 1;
11414 }
11415 return 0;
11416}
11417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011418PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011419 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011421Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011422string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011423interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
11425static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011426unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011428 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011429 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011430 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011432 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 void *buf1, *buf2;
11434 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011436 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 kind1 = PyUnicode_KIND(self);
11440 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011441 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011442 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 len1 = PyUnicode_GET_LENGTH(self);
11445 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011447 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011448 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011449
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011450 buf1 = PyUnicode_DATA(self);
11451 buf2 = PyUnicode_DATA(substring);
11452 if (kind2 != kind1) {
11453 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011454 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011455 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011456 }
11457 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 case PyUnicode_1BYTE_KIND:
11459 iresult = ucs1lib_count(
11460 ((Py_UCS1*)buf1) + start, end - start,
11461 buf2, len2, PY_SSIZE_T_MAX
11462 );
11463 break;
11464 case PyUnicode_2BYTE_KIND:
11465 iresult = ucs2lib_count(
11466 ((Py_UCS2*)buf1) + start, end - start,
11467 buf2, len2, PY_SSIZE_T_MAX
11468 );
11469 break;
11470 case PyUnicode_4BYTE_KIND:
11471 iresult = ucs4lib_count(
11472 ((Py_UCS4*)buf1) + start, end - start,
11473 buf2, len2, PY_SSIZE_T_MAX
11474 );
11475 break;
11476 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011477 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 }
11479
11480 result = PyLong_FromSsize_t(iresult);
11481
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011482 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485 return result;
11486}
11487
INADA Naoki3ae20562017-01-16 20:41:20 +090011488/*[clinic input]
11489str.encode as unicode_encode
11490
11491 encoding: str(c_default="NULL") = 'utf-8'
11492 The encoding in which to encode the string.
11493 errors: str(c_default="NULL") = 'strict'
11494 The error handling scheme to use for encoding errors.
11495 The default is 'strict' meaning that encoding errors raise a
11496 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11497 'xmlcharrefreplace' as well as any other name registered with
11498 codecs.register_error that can handle UnicodeEncodeErrors.
11499
11500Encode the string using the codec registered for encoding.
11501[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
11503static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011504unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011505/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011507 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011508}
11509
INADA Naoki3ae20562017-01-16 20:41:20 +090011510/*[clinic input]
11511str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512
INADA Naoki3ae20562017-01-16 20:41:20 +090011513 tabsize: int = 8
11514
11515Return a copy where all tab characters are expanded using spaces.
11516
11517If tabsize is not given, a tab size of 8 characters is assumed.
11518[clinic start generated code]*/
11519
11520static PyObject *
11521unicode_expandtabs_impl(PyObject *self, int tabsize)
11522/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011524 Py_ssize_t i, j, line_pos, src_len, incr;
11525 Py_UCS4 ch;
11526 PyObject *u;
11527 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011528 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011529 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530
Antoine Pitrou22425222011-10-04 19:10:51 +020011531 if (PyUnicode_READY(self) == -1)
11532 return NULL;
11533
Thomas Wouters7e474022000-07-16 12:04:32 +000011534 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011535 src_len = PyUnicode_GET_LENGTH(self);
11536 i = j = line_pos = 0;
11537 kind = PyUnicode_KIND(self);
11538 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011539 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011540 for (; i < src_len; i++) {
11541 ch = PyUnicode_READ(kind, src_data, i);
11542 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011543 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011544 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011545 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011547 goto overflow;
11548 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011550 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011554 goto overflow;
11555 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011557 if (ch == '\n' || ch == '\r')
11558 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011560 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011561 if (!found)
11562 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011563
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011565 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 if (!u)
11567 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569
Antoine Pitroue71d5742011-10-04 15:55:09 +020011570 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571
Antoine Pitroue71d5742011-10-04 15:55:09 +020011572 for (; i < src_len; i++) {
11573 ch = PyUnicode_READ(kind, src_data, i);
11574 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 incr = tabsize - (line_pos % tabsize);
11577 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011578 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011579 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011581 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011583 line_pos++;
11584 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011585 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011586 if (ch == '\n' || ch == '\r')
11587 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011589 }
11590 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011591 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011592
Antoine Pitroue71d5742011-10-04 15:55:09 +020011593 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011594 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596}
11597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011598PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600\n\
11601Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011602such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603arguments start and end are interpreted as in slice notation.\n\
11604\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011605Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606
11607static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011610 /* initialize variables to prevent gcc warning */
11611 PyObject *substring = NULL;
11612 Py_ssize_t start = 0;
11613 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011614 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011616 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011619 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011622 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 if (result == -2)
11625 return NULL;
11626
Christian Heimes217cfd12007-12-02 14:31:20 +000011627 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628}
11629
11630static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011631unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011633 void *data;
11634 enum PyUnicode_Kind kind;
11635 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011636
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011637 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011638 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011640 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011641 if (PyUnicode_READY(self) == -1) {
11642 return NULL;
11643 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011644 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11645 PyErr_SetString(PyExc_IndexError, "string index out of range");
11646 return NULL;
11647 }
11648 kind = PyUnicode_KIND(self);
11649 data = PyUnicode_DATA(self);
11650 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011651 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652}
11653
Guido van Rossumc2504932007-09-18 19:42:40 +000011654/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011655 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011656static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011657unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658{
Guido van Rossumc2504932007-09-18 19:42:40 +000011659 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011660 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011661
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011662#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011663 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011664#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 if (_PyUnicode_HASH(self) != -1)
11666 return _PyUnicode_HASH(self);
11667 if (PyUnicode_READY(self) == -1)
11668 return -1;
11669 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011670 /*
11671 We make the hash of the empty string be 0, rather than using
11672 (prefix ^ suffix), since this slightly obfuscates the hash secret
11673 */
11674 if (len == 0) {
11675 _PyUnicode_HASH(self) = 0;
11676 return 0;
11677 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011678 x = _Py_HashBytes(PyUnicode_DATA(self),
11679 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011681 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682}
11683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011684PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686\n\
oldkaa0735f2018-02-02 16:52:55 +080011687Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011688such that sub is contained within S[start:end]. Optional\n\
11689arguments start and end are interpreted as in slice notation.\n\
11690\n\
11691Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692
11693static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011696 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011697 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011698 PyObject *substring = NULL;
11699 Py_ssize_t start = 0;
11700 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011702 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011705 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011708 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 if (result == -2)
11711 return NULL;
11712
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713 if (result < 0) {
11714 PyErr_SetString(PyExc_ValueError, "substring not found");
11715 return NULL;
11716 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011717
Christian Heimes217cfd12007-12-02 14:31:20 +000011718 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719}
11720
INADA Naoki3ae20562017-01-16 20:41:20 +090011721/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011722str.isascii as unicode_isascii
11723
11724Return True if all characters in the string are ASCII, False otherwise.
11725
11726ASCII characters have code points in the range U+0000-U+007F.
11727Empty string is ASCII too.
11728[clinic start generated code]*/
11729
11730static PyObject *
11731unicode_isascii_impl(PyObject *self)
11732/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11733{
11734 if (PyUnicode_READY(self) == -1) {
11735 return NULL;
11736 }
11737 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11738}
11739
11740/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011741str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
INADA Naoki3ae20562017-01-16 20:41:20 +090011743Return True if the string is a lowercase string, False otherwise.
11744
11745A string is lowercase if all cased characters in the string are lowercase and
11746there is at least one cased character in the string.
11747[clinic start generated code]*/
11748
11749static PyObject *
11750unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011751/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 Py_ssize_t i, length;
11754 int kind;
11755 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 int cased;
11757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 if (PyUnicode_READY(self) == -1)
11759 return NULL;
11760 length = PyUnicode_GET_LENGTH(self);
11761 kind = PyUnicode_KIND(self);
11762 data = PyUnicode_DATA(self);
11763
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 if (length == 1)
11766 return PyBool_FromLong(
11767 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011769 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011771 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011772
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 for (i = 0; i < length; i++) {
11775 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011776
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011778 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011779 else if (!cased && Py_UNICODE_ISLOWER(ch))
11780 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011782 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783}
11784
INADA Naoki3ae20562017-01-16 20:41:20 +090011785/*[clinic input]
11786str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787
INADA Naoki3ae20562017-01-16 20:41:20 +090011788Return True if the string is an uppercase string, False otherwise.
11789
11790A string is uppercase if all cased characters in the string are uppercase and
11791there is at least one cased character in the string.
11792[clinic start generated code]*/
11793
11794static PyObject *
11795unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011796/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 Py_ssize_t i, length;
11799 int kind;
11800 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801 int cased;
11802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 if (PyUnicode_READY(self) == -1)
11804 return NULL;
11805 length = PyUnicode_GET_LENGTH(self);
11806 kind = PyUnicode_KIND(self);
11807 data = PyUnicode_DATA(self);
11808
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 if (length == 1)
11811 return PyBool_FromLong(
11812 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011814 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011816 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011817
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 for (i = 0; i < length; i++) {
11820 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011821
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011823 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 else if (!cased && Py_UNICODE_ISUPPER(ch))
11825 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011827 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828}
11829
INADA Naoki3ae20562017-01-16 20:41:20 +090011830/*[clinic input]
11831str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832
INADA Naoki3ae20562017-01-16 20:41:20 +090011833Return True if the string is a title-cased string, False otherwise.
11834
11835In a title-cased string, upper- and title-case characters may only
11836follow uncased characters and lowercase characters only cased ones.
11837[clinic start generated code]*/
11838
11839static PyObject *
11840unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011841/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 Py_ssize_t i, length;
11844 int kind;
11845 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846 int cased, previous_is_cased;
11847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 if (PyUnicode_READY(self) == -1)
11849 return NULL;
11850 length = PyUnicode_GET_LENGTH(self);
11851 kind = PyUnicode_KIND(self);
11852 data = PyUnicode_DATA(self);
11853
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 if (length == 1) {
11856 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11857 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11858 (Py_UNICODE_ISUPPER(ch) != 0));
11859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011861 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011863 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011864
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 cased = 0;
11866 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 for (i = 0; i < length; i++) {
11868 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011869
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11871 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011872 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 previous_is_cased = 1;
11874 cased = 1;
11875 }
11876 else if (Py_UNICODE_ISLOWER(ch)) {
11877 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011878 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 previous_is_cased = 1;
11880 cased = 1;
11881 }
11882 else
11883 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011885 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886}
11887
INADA Naoki3ae20562017-01-16 20:41:20 +090011888/*[clinic input]
11889str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
INADA Naoki3ae20562017-01-16 20:41:20 +090011891Return True if the string is a whitespace string, False otherwise.
11892
11893A string is whitespace if all characters in the string are whitespace and there
11894is at least one character in the string.
11895[clinic start generated code]*/
11896
11897static PyObject *
11898unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011899/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 Py_ssize_t i, length;
11902 int kind;
11903 void *data;
11904
11905 if (PyUnicode_READY(self) == -1)
11906 return NULL;
11907 length = PyUnicode_GET_LENGTH(self);
11908 kind = PyUnicode_KIND(self);
11909 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 if (length == 1)
11913 return PyBool_FromLong(
11914 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011916 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011918 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 for (i = 0; i < length; i++) {
11921 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011922 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011923 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011925 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926}
11927
INADA Naoki3ae20562017-01-16 20:41:20 +090011928/*[clinic input]
11929str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011930
INADA Naoki3ae20562017-01-16 20:41:20 +090011931Return True if the string is an alphabetic string, False otherwise.
11932
11933A string is alphabetic if all characters in the string are alphabetic and there
11934is at least one character in the string.
11935[clinic start generated code]*/
11936
11937static PyObject *
11938unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011939/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 Py_ssize_t i, length;
11942 int kind;
11943 void *data;
11944
11945 if (PyUnicode_READY(self) == -1)
11946 return NULL;
11947 length = PyUnicode_GET_LENGTH(self);
11948 kind = PyUnicode_KIND(self);
11949 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011950
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011951 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 if (length == 1)
11953 return PyBool_FromLong(
11954 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011955
11956 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011958 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 for (i = 0; i < length; i++) {
11961 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011962 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011963 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011964 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011965}
11966
INADA Naoki3ae20562017-01-16 20:41:20 +090011967/*[clinic input]
11968str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011969
INADA Naoki3ae20562017-01-16 20:41:20 +090011970Return True if the string is an alpha-numeric string, False otherwise.
11971
11972A string is alpha-numeric if all characters in the string are alpha-numeric and
11973there is at least one character in the string.
11974[clinic start generated code]*/
11975
11976static PyObject *
11977unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011978/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 int kind;
11981 void *data;
11982 Py_ssize_t len, i;
11983
11984 if (PyUnicode_READY(self) == -1)
11985 return NULL;
11986
11987 kind = PyUnicode_KIND(self);
11988 data = PyUnicode_DATA(self);
11989 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011990
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011991 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 if (len == 1) {
11993 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11994 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11995 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011996
11997 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011999 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 for (i = 0; i < len; i++) {
12002 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012003 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012004 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012005 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012006 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012007}
12008
INADA Naoki3ae20562017-01-16 20:41:20 +090012009/*[clinic input]
12010str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
INADA Naoki3ae20562017-01-16 20:41:20 +090012012Return True if the string is a decimal string, False otherwise.
12013
12014A string is a decimal string if all characters in the string are decimal and
12015there is at least one character in the string.
12016[clinic start generated code]*/
12017
12018static PyObject *
12019unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012020/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 Py_ssize_t i, length;
12023 int kind;
12024 void *data;
12025
12026 if (PyUnicode_READY(self) == -1)
12027 return NULL;
12028 length = PyUnicode_GET_LENGTH(self);
12029 kind = PyUnicode_KIND(self);
12030 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 if (length == 1)
12034 return PyBool_FromLong(
12035 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012037 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012039 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 for (i = 0; i < length; i++) {
12042 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012043 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012045 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046}
12047
INADA Naoki3ae20562017-01-16 20:41:20 +090012048/*[clinic input]
12049str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050
INADA Naoki3ae20562017-01-16 20:41:20 +090012051Return True if the string is a digit string, False otherwise.
12052
12053A string is a digit string if all characters in the string are digits and there
12054is at least one character in the string.
12055[clinic start generated code]*/
12056
12057static PyObject *
12058unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012059/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 Py_ssize_t i, length;
12062 int kind;
12063 void *data;
12064
12065 if (PyUnicode_READY(self) == -1)
12066 return NULL;
12067 length = PyUnicode_GET_LENGTH(self);
12068 kind = PyUnicode_KIND(self);
12069 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 if (length == 1) {
12073 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12074 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012077 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012079 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 for (i = 0; i < length; i++) {
12082 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012083 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012085 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086}
12087
INADA Naoki3ae20562017-01-16 20:41:20 +090012088/*[clinic input]
12089str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090
INADA Naoki3ae20562017-01-16 20:41:20 +090012091Return True if the string is a numeric string, False otherwise.
12092
12093A string is numeric if all characters in the string are numeric and there is at
12094least one character in the string.
12095[clinic start generated code]*/
12096
12097static PyObject *
12098unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012099/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 Py_ssize_t i, length;
12102 int kind;
12103 void *data;
12104
12105 if (PyUnicode_READY(self) == -1)
12106 return NULL;
12107 length = PyUnicode_GET_LENGTH(self);
12108 kind = PyUnicode_KIND(self);
12109 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 if (length == 1)
12113 return PyBool_FromLong(
12114 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012116 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012118 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 for (i = 0; i < length; i++) {
12121 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012122 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012124 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125}
12126
Martin v. Löwis47383402007-08-15 07:32:56 +000012127int
12128PyUnicode_IsIdentifier(PyObject *self)
12129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 int kind;
12131 void *data;
12132 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012133 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 if (PyUnicode_READY(self) == -1) {
12136 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012137 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 }
12139
12140 /* Special case for empty strings */
12141 if (PyUnicode_GET_LENGTH(self) == 0)
12142 return 0;
12143 kind = PyUnicode_KIND(self);
12144 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012145
12146 /* PEP 3131 says that the first character must be in
12147 XID_Start and subsequent characters in XID_Continue,
12148 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012150 letters, digits, underscore). However, given the current
12151 definition of XID_Start and XID_Continue, it is sufficient
12152 to check just for these, except that _ must be allowed
12153 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012155 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012156 return 0;
12157
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012158 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012160 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012161 return 1;
12162}
12163
INADA Naoki3ae20562017-01-16 20:41:20 +090012164/*[clinic input]
12165str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012166
INADA Naoki3ae20562017-01-16 20:41:20 +090012167Return True if the string is a valid Python identifier, False otherwise.
12168
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012169Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012170such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012171[clinic start generated code]*/
12172
12173static PyObject *
12174unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012175/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012176{
12177 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12178}
12179
INADA Naoki3ae20562017-01-16 20:41:20 +090012180/*[clinic input]
12181str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012182
INADA Naoki3ae20562017-01-16 20:41:20 +090012183Return True if the string is printable, False otherwise.
12184
12185A string is printable if all of its characters are considered printable in
12186repr() or if it is empty.
12187[clinic start generated code]*/
12188
12189static PyObject *
12190unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012191/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 Py_ssize_t i, length;
12194 int kind;
12195 void *data;
12196
12197 if (PyUnicode_READY(self) == -1)
12198 return NULL;
12199 length = PyUnicode_GET_LENGTH(self);
12200 kind = PyUnicode_KIND(self);
12201 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012202
12203 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 if (length == 1)
12205 return PyBool_FromLong(
12206 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 for (i = 0; i < length; i++) {
12209 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012210 Py_RETURN_FALSE;
12211 }
12212 }
12213 Py_RETURN_TRUE;
12214}
12215
INADA Naoki3ae20562017-01-16 20:41:20 +090012216/*[clinic input]
12217str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218
INADA Naoki3ae20562017-01-16 20:41:20 +090012219 iterable: object
12220 /
12221
12222Concatenate any number of strings.
12223
Martin Panter91a88662017-01-24 00:30:06 +000012224The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012225The result is returned as a new string.
12226
12227Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12228[clinic start generated code]*/
12229
12230static PyObject *
12231unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012232/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233{
INADA Naoki3ae20562017-01-16 20:41:20 +090012234 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235}
12236
Martin v. Löwis18e16552006-02-15 17:27:45 +000012237static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012238unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 if (PyUnicode_READY(self) == -1)
12241 return -1;
12242 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243}
12244
INADA Naoki3ae20562017-01-16 20:41:20 +090012245/*[clinic input]
12246str.ljust as unicode_ljust
12247
12248 width: Py_ssize_t
12249 fillchar: Py_UCS4 = ' '
12250 /
12251
12252Return a left-justified string of length width.
12253
12254Padding is done using the specified fill character (default is a space).
12255[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256
12257static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012258unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12259/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012261 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263
Victor Stinnerc4b49542011-12-11 22:44:26 +010012264 if (PyUnicode_GET_LENGTH(self) >= width)
12265 return unicode_result_unchanged(self);
12266
12267 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268}
12269
INADA Naoki3ae20562017-01-16 20:41:20 +090012270/*[clinic input]
12271str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272
INADA Naoki3ae20562017-01-16 20:41:20 +090012273Return a copy of the string converted to lowercase.
12274[clinic start generated code]*/
12275
12276static PyObject *
12277unicode_lower_impl(PyObject *self)
12278/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012280 if (PyUnicode_READY(self) == -1)
12281 return NULL;
12282 if (PyUnicode_IS_ASCII(self))
12283 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012284 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285}
12286
/* Strip direction selectors; they double as indexes into stripfuncnames. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  Both the strings and the table slots are
   const, so the table cannot be modified after initialization. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the strip method for selector i (LEFTSTRIP, RIGHTSTRIP or
   BOTHSTRIP).  No bounds check is performed on i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012295
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012296/* externally visible for str.strip(unicode) */
12297PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012298_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012299{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 void *data;
12301 int kind;
12302 Py_ssize_t i, j, len;
12303 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012304 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12307 return NULL;
12308
12309 kind = PyUnicode_KIND(self);
12310 data = PyUnicode_DATA(self);
12311 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012312 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12314 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012315 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012316
Benjamin Peterson14339b62009-01-31 16:36:08 +000012317 i = 0;
12318 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012319 while (i < len) {
12320 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12321 if (!BLOOM(sepmask, ch))
12322 break;
12323 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12324 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012325 i++;
12326 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012327 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012328
Benjamin Peterson14339b62009-01-31 16:36:08 +000012329 j = len;
12330 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012331 j--;
12332 while (j >= i) {
12333 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12334 if (!BLOOM(sepmask, ch))
12335 break;
12336 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12337 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012339 }
12340
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012342 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012343
Victor Stinner7931d9a2011-11-04 00:22:48 +010012344 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345}
12346
12347PyObject*
12348PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12349{
12350 unsigned char *data;
12351 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012352 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353
Victor Stinnerde636f32011-10-01 03:55:54 +020012354 if (PyUnicode_READY(self) == -1)
12355 return NULL;
12356
Victor Stinner684d5fd2012-05-03 02:32:34 +020012357 length = PyUnicode_GET_LENGTH(self);
12358 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012359
Victor Stinner684d5fd2012-05-03 02:32:34 +020012360 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012361 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362
Victor Stinnerde636f32011-10-01 03:55:54 +020012363 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012364 PyErr_SetString(PyExc_IndexError, "string index out of range");
12365 return NULL;
12366 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012367 if (start >= length || end < start)
12368 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012369
Victor Stinner684d5fd2012-05-03 02:32:34 +020012370 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012371 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012372 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012373 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012374 }
12375 else {
12376 kind = PyUnicode_KIND(self);
12377 data = PyUnicode_1BYTE_DATA(self);
12378 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012379 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012380 length);
12381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383
12384static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012385do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 Py_ssize_t len, i, j;
12388
12389 if (PyUnicode_READY(self) == -1)
12390 return NULL;
12391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012393
Victor Stinnercc7af722013-04-09 22:39:24 +020012394 if (PyUnicode_IS_ASCII(self)) {
12395 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12396
12397 i = 0;
12398 if (striptype != RIGHTSTRIP) {
12399 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012400 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012401 if (!_Py_ascii_whitespace[ch])
12402 break;
12403 i++;
12404 }
12405 }
12406
12407 j = len;
12408 if (striptype != LEFTSTRIP) {
12409 j--;
12410 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012411 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012412 if (!_Py_ascii_whitespace[ch])
12413 break;
12414 j--;
12415 }
12416 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012417 }
12418 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012419 else {
12420 int kind = PyUnicode_KIND(self);
12421 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012422
Victor Stinnercc7af722013-04-09 22:39:24 +020012423 i = 0;
12424 if (striptype != RIGHTSTRIP) {
12425 while (i < len) {
12426 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12427 if (!Py_UNICODE_ISSPACE(ch))
12428 break;
12429 i++;
12430 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012431 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012432
12433 j = len;
12434 if (striptype != LEFTSTRIP) {
12435 j--;
12436 while (j >= i) {
12437 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12438 if (!Py_UNICODE_ISSPACE(ch))
12439 break;
12440 j--;
12441 }
12442 j++;
12443 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012444 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012445
Victor Stinner7931d9a2011-11-04 00:22:48 +010012446 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447}
12448
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012449
12450static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012451do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012452{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012453 if (sep != NULL && sep != Py_None) {
12454 if (PyUnicode_Check(sep))
12455 return _PyUnicode_XStrip(self, striptype, sep);
12456 else {
12457 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012458 "%s arg must be None or str",
12459 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012460 return NULL;
12461 }
12462 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012463
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012465}
12466
12467
/*[clinic input]
str.strip as unicode_strip

    chars: object = None
    /

Return a copy of the string with leading and trailing whitespace removed.

If chars is given and not None, remove characters in chars instead.
[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012478
12479static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012480unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012481/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012482{
INADA Naoki3ae20562017-01-16 20:41:20 +090012483 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012484}
12485
12486
/*[clinic input]
str.lstrip as unicode_lstrip

    chars: object = NULL
    /

Return a copy of the string with leading whitespace removed.

If chars is given and not None, remove characters in chars instead.
[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012497
12498static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012499unicode_lstrip_impl(PyObject *self, PyObject *chars)
12500/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012501{
INADA Naoki3ae20562017-01-16 20:41:20 +090012502 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012503}
12504
12505
/*[clinic input]
str.rstrip as unicode_rstrip

    chars: object = NULL
    /

Return a copy of the string with trailing whitespace removed.

If chars is given and not None, remove characters in chars instead.
[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012516
12517static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012518unicode_rstrip_impl(PyObject *self, PyObject *chars)
12519/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012520{
INADA Naoki3ae20562017-01-16 20:41:20 +090012521 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012522}
12523
12524
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012526unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012528 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
Serhiy Storchaka05997252013-01-26 12:14:02 +020012531 if (len < 1)
12532 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533
Victor Stinnerc4b49542011-12-11 22:44:26 +010012534 /* no repeat, return original string */
12535 if (len == 1)
12536 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012537
Benjamin Petersonbac79492012-01-14 13:34:47 -050012538 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 return NULL;
12540
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012541 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012542 PyErr_SetString(PyExc_OverflowError,
12543 "repeated string is too long");
12544 return NULL;
12545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012547
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012548 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549 if (!u)
12550 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012551 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 if (PyUnicode_GET_LENGTH(str) == 1) {
12554 const int kind = PyUnicode_KIND(str);
12555 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012556 if (kind == PyUnicode_1BYTE_KIND) {
12557 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012558 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012559 }
12560 else if (kind == PyUnicode_2BYTE_KIND) {
12561 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012562 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012563 ucs2[n] = fill_char;
12564 } else {
12565 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12566 assert(kind == PyUnicode_4BYTE_KIND);
12567 for (n = 0; n < len; ++n)
12568 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 }
12571 else {
12572 /* number of characters copied this far */
12573 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012574 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012576 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012580 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012581 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583 }
12584
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012585 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012586 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587}
12588
Alexander Belopolsky40018472011-02-26 01:02:56 +000012589PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012590PyUnicode_Replace(PyObject *str,
12591 PyObject *substr,
12592 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012593 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012595 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12596 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012597 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012598 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599}
12600
/*[clinic input]
str.replace as unicode_replace

    old: unicode
    new: unicode
    count: Py_ssize_t = -1
        Maximum number of occurrences to replace.
        -1 (the default value) means replace all occurrences.
    /

Return a copy with all occurrences of substring old replaced by new.

If the optional argument count is given, only the first count occurrences are
replaced.
[clinic start generated code]*/
12616
12617static PyObject *
12618unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12619 Py_ssize_t count)
12620/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012622 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012624 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625}
12626
Alexander Belopolsky40018472011-02-26 01:02:56 +000012627static PyObject *
12628unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012630 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 Py_ssize_t isize;
12632 Py_ssize_t osize, squote, dquote, i, o;
12633 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012634 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012638 return NULL;
12639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 isize = PyUnicode_GET_LENGTH(unicode);
12641 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 /* Compute length of output, quote characters, and
12644 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012645 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 max = 127;
12647 squote = dquote = 0;
12648 ikind = PyUnicode_KIND(unicode);
12649 for (i = 0; i < isize; i++) {
12650 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012651 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012653 case '\'': squote++; break;
12654 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012656 incr = 2;
12657 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 default:
12659 /* Fast-path ASCII */
12660 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012661 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012663 ;
12664 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012667 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012669 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012671 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012673 if (osize > PY_SSIZE_T_MAX - incr) {
12674 PyErr_SetString(PyExc_OverflowError,
12675 "string is too long to generate repr");
12676 return NULL;
12677 }
12678 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 }
12680
12681 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012682 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012684 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 if (dquote)
12686 /* Both squote and dquote present. Use squote,
12687 and escape them */
12688 osize += squote;
12689 else
12690 quote = '"';
12691 }
Victor Stinner55c08782013-04-14 18:45:39 +020012692 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693
12694 repr = PyUnicode_New(osize, max);
12695 if (repr == NULL)
12696 return NULL;
12697 okind = PyUnicode_KIND(repr);
12698 odata = PyUnicode_DATA(repr);
12699
12700 PyUnicode_WRITE(okind, odata, 0, quote);
12701 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012702 if (unchanged) {
12703 _PyUnicode_FastCopyCharacters(repr, 1,
12704 unicode, 0,
12705 isize);
12706 }
12707 else {
12708 for (i = 0, o = 1; i < isize; i++) {
12709 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710
Victor Stinner55c08782013-04-14 18:45:39 +020012711 /* Escape quotes and backslashes */
12712 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012713 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012715 continue;
12716 }
12717
12718 /* Map special whitespace to '\t', \n', '\r' */
12719 if (ch == '\t') {
12720 PyUnicode_WRITE(okind, odata, o++, '\\');
12721 PyUnicode_WRITE(okind, odata, o++, 't');
12722 }
12723 else if (ch == '\n') {
12724 PyUnicode_WRITE(okind, odata, o++, '\\');
12725 PyUnicode_WRITE(okind, odata, o++, 'n');
12726 }
12727 else if (ch == '\r') {
12728 PyUnicode_WRITE(okind, odata, o++, '\\');
12729 PyUnicode_WRITE(okind, odata, o++, 'r');
12730 }
12731
12732 /* Map non-printable US ASCII to '\xhh' */
12733 else if (ch < ' ' || ch == 0x7F) {
12734 PyUnicode_WRITE(okind, odata, o++, '\\');
12735 PyUnicode_WRITE(okind, odata, o++, 'x');
12736 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12737 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12738 }
12739
12740 /* Copy ASCII characters as-is */
12741 else if (ch < 0x7F) {
12742 PyUnicode_WRITE(okind, odata, o++, ch);
12743 }
12744
12745 /* Non-ASCII characters */
12746 else {
12747 /* Map Unicode whitespace and control characters
12748 (categories Z* and C* except ASCII space)
12749 */
12750 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12751 PyUnicode_WRITE(okind, odata, o++, '\\');
12752 /* Map 8-bit characters to '\xhh' */
12753 if (ch <= 0xff) {
12754 PyUnicode_WRITE(okind, odata, o++, 'x');
12755 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12756 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12757 }
12758 /* Map 16-bit characters to '\uxxxx' */
12759 else if (ch <= 0xffff) {
12760 PyUnicode_WRITE(okind, odata, o++, 'u');
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12762 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12763 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12764 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12765 }
12766 /* Map 21-bit characters to '\U00xxxxxx' */
12767 else {
12768 PyUnicode_WRITE(okind, odata, o++, 'U');
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12771 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12776 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12777 }
12778 }
12779 /* Copy characters as-is */
12780 else {
12781 PyUnicode_WRITE(okind, odata, o++, ch);
12782 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012783 }
12784 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012787 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012788 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789}
12790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012791PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793\n\
12794Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012795such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796arguments start and end are interpreted as in slice notation.\n\
12797\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012798Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799
12800static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012803 /* initialize variables to prevent gcc warning */
12804 PyObject *substring = NULL;
12805 Py_ssize_t start = 0;
12806 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012809 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012812 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012815 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012817 if (result == -2)
12818 return NULL;
12819
Christian Heimes217cfd12007-12-02 14:31:20 +000012820 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821}
12822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012823PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012826Return the highest index in S where substring sub is found,\n\
12827such that sub is contained within S[start:end]. Optional\n\
12828arguments start and end are interpreted as in slice notation.\n\
12829\n\
12830Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831
12832static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012835 /* initialize variables to prevent gcc warning */
12836 PyObject *substring = NULL;
12837 Py_ssize_t start = 0;
12838 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012839 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012841 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012844 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012845 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012847 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 if (result == -2)
12850 return NULL;
12851
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852 if (result < 0) {
12853 PyErr_SetString(PyExc_ValueError, "substring not found");
12854 return NULL;
12855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856
Christian Heimes217cfd12007-12-02 14:31:20 +000012857 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858}
12859
/*[clinic input]
str.rjust as unicode_rjust

    width: Py_ssize_t
    fillchar: Py_UCS4 = ' '
    /

Return a right-justified string of length width.

Padding is done using the specified fill character (default is a space).
[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871
12872static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012873unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12874/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012876 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877 return NULL;
12878
Victor Stinnerc4b49542011-12-11 22:44:26 +010012879 if (PyUnicode_GET_LENGTH(self) >= width)
12880 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881
Victor Stinnerc4b49542011-12-11 22:44:26 +010012882 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883}
12884
Alexander Belopolsky40018472011-02-26 01:02:56 +000012885PyObject *
12886PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012888 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012891 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892}
12893
/*[clinic input]
str.split as unicode_split

    sep: object = None
        The delimiter according to which to split the string.
        None (the default value) means split according to any whitespace,
        and discard empty strings from the result.
    maxsplit: Py_ssize_t = -1
        Maximum number of splits to do.
        -1 (the default value) means no limit.

Return a list of the words in the string, using sep as the delimiter string.
[clinic start generated code]*/
12907
12908static PyObject *
12909unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12910/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911{
INADA Naoki3ae20562017-01-16 20:41:20 +090012912 if (sep == Py_None)
12913 return split(self, NULL, maxsplit);
12914 if (PyUnicode_Check(sep))
12915 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012916
Victor Stinner998b8062018-09-12 00:23:25 +020012917 PyErr_Format(PyExc_TypeError,
12918 "must be str or None, not %.100s",
12919 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012921}
12922
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012924PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012925{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012926 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012927 int kind1, kind2;
12928 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012931 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012933
Victor Stinner14f8f022011-10-05 20:58:25 +020012934 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 len1 = PyUnicode_GET_LENGTH(str_obj);
12937 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012938 if (kind1 < kind2 || len1 < len2) {
12939 _Py_INCREF_UNICODE_EMPTY();
12940 if (!unicode_empty)
12941 out = NULL;
12942 else {
12943 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12944 Py_DECREF(unicode_empty);
12945 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012946 return out;
12947 }
12948 buf1 = PyUnicode_DATA(str_obj);
12949 buf2 = PyUnicode_DATA(sep_obj);
12950 if (kind2 != kind1) {
12951 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12952 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012953 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012956 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012958 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12959 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12960 else
12961 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 break;
12963 case PyUnicode_2BYTE_KIND:
12964 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965 break;
12966 case PyUnicode_4BYTE_KIND:
12967 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12968 break;
12969 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012970 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012973 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012975
12976 return out;
12977}
12978
12979
12980PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012981PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012982{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012983 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012984 int kind1, kind2;
12985 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012988 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012990
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012991 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 len1 = PyUnicode_GET_LENGTH(str_obj);
12994 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012995 if (kind1 < kind2 || len1 < len2) {
12996 _Py_INCREF_UNICODE_EMPTY();
12997 if (!unicode_empty)
12998 out = NULL;
12999 else {
13000 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13001 Py_DECREF(unicode_empty);
13002 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013003 return out;
13004 }
13005 buf1 = PyUnicode_DATA(str_obj);
13006 buf2 = PyUnicode_DATA(sep_obj);
13007 if (kind2 != kind1) {
13008 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13009 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013010 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013013 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013015 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13016 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13017 else
13018 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 break;
13020 case PyUnicode_2BYTE_KIND:
13021 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022 break;
13023 case PyUnicode_4BYTE_KIND:
13024 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13025 break;
13026 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013027 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013029
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013030 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013031 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013032
13033 return out;
13034}
13035
INADA Naoki3ae20562017-01-16 20:41:20 +090013036/*[clinic input]
13037str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013038
INADA Naoki3ae20562017-01-16 20:41:20 +090013039 sep: object
13040 /
13041
13042Partition the string into three parts using the given separator.
13043
13044This will search for the separator in the string. If the separator is found,
13045returns a 3-tuple containing the part before the separator, the separator
13046itself, and the part after it.
13047
13048If the separator is not found, returns a 3-tuple containing the original string
13049and two empty strings.
13050[clinic start generated code]*/
13051
13052static PyObject *
13053unicode_partition(PyObject *self, PyObject *sep)
13054/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055{
INADA Naoki3ae20562017-01-16 20:41:20 +090013056 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013057}
13058
INADA Naoki3ae20562017-01-16 20:41:20 +090013059/*[clinic input]
13060str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013061
INADA Naoki3ae20562017-01-16 20:41:20 +090013062Partition the string into three parts using the given separator.
13063
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013064This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013065the separator is found, returns a 3-tuple containing the part before the
13066separator, the separator itself, and the part after it.
13067
13068If the separator is not found, returns a 3-tuple containing two empty strings
13069and the original string.
13070[clinic start generated code]*/
13071
13072static PyObject *
13073unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013074/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013075{
INADA Naoki3ae20562017-01-16 20:41:20 +090013076 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013077}
13078
Alexander Belopolsky40018472011-02-26 01:02:56 +000013079PyObject *
13080PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013081{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013082 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013083 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013084
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013085 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013086}
13087
INADA Naoki3ae20562017-01-16 20:41:20 +090013088/*[clinic input]
13089str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013090
INADA Naoki3ae20562017-01-16 20:41:20 +090013091Return a list of the words in the string, using sep as the delimiter string.
13092
13093Splits are done starting at the end of the string and working to the front.
13094[clinic start generated code]*/
13095
13096static PyObject *
13097unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13098/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013099{
INADA Naoki3ae20562017-01-16 20:41:20 +090013100 if (sep == Py_None)
13101 return rsplit(self, NULL, maxsplit);
13102 if (PyUnicode_Check(sep))
13103 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013104
Victor Stinner998b8062018-09-12 00:23:25 +020013105 PyErr_Format(PyExc_TypeError,
13106 "must be str or None, not %.100s",
13107 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013108 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013109}
13110
INADA Naoki3ae20562017-01-16 20:41:20 +090013111/*[clinic input]
13112str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013114 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013115
13116Return a list of the lines in the string, breaking at line boundaries.
13117
13118Line breaks are not included in the resulting list unless keepends is given and
13119true.
13120[clinic start generated code]*/
13121
13122static PyObject *
13123unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013124/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013126 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127}
13128
13129static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013130PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013132 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133}
13134
INADA Naoki3ae20562017-01-16 20:41:20 +090013135/*[clinic input]
13136str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137
INADA Naoki3ae20562017-01-16 20:41:20 +090013138Convert uppercase characters to lowercase and lowercase characters to uppercase.
13139[clinic start generated code]*/
13140
13141static PyObject *
13142unicode_swapcase_impl(PyObject *self)
13143/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013145 if (PyUnicode_READY(self) == -1)
13146 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013147 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148}
13149
Larry Hastings61272b72014-01-07 12:41:53 -080013150/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013151
Larry Hastings31826802013-10-19 00:09:25 -070013152@staticmethod
13153str.maketrans as unicode_maketrans
13154
13155 x: object
13156
13157 y: unicode=NULL
13158
13159 z: unicode=NULL
13160
13161 /
13162
13163Return a translation table usable for str.translate().
13164
13165If there is only one argument, it must be a dictionary mapping Unicode
13166ordinals (integers) or characters to Unicode ordinals, strings or None.
13167Character keys will be then converted to ordinals.
13168If there are two arguments, they must be strings of equal length, and
13169in the resulting dictionary, each character in x will be mapped to the
13170character at the same position in y. If there is a third argument, it
13171must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013172[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013173
Larry Hastings31826802013-10-19 00:09:25 -070013174static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013175unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013176/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013177{
Georg Brandlceee0772007-11-27 23:48:05 +000013178 PyObject *new = NULL, *key, *value;
13179 Py_ssize_t i = 0;
13180 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181
Georg Brandlceee0772007-11-27 23:48:05 +000013182 new = PyDict_New();
13183 if (!new)
13184 return NULL;
13185 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013186 int x_kind, y_kind, z_kind;
13187 void *x_data, *y_data, *z_data;
13188
Georg Brandlceee0772007-11-27 23:48:05 +000013189 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013190 if (!PyUnicode_Check(x)) {
13191 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13192 "be a string if there is a second argument");
13193 goto err;
13194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013196 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13197 "arguments must have equal length");
13198 goto err;
13199 }
13200 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 x_kind = PyUnicode_KIND(x);
13202 y_kind = PyUnicode_KIND(y);
13203 x_data = PyUnicode_DATA(x);
13204 y_data = PyUnicode_DATA(y);
13205 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13206 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013207 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013208 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013209 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013210 if (!value) {
13211 Py_DECREF(key);
13212 goto err;
13213 }
Georg Brandlceee0772007-11-27 23:48:05 +000013214 res = PyDict_SetItem(new, key, value);
13215 Py_DECREF(key);
13216 Py_DECREF(value);
13217 if (res < 0)
13218 goto err;
13219 }
13220 /* create entries for deleting chars in z */
13221 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013222 z_kind = PyUnicode_KIND(z);
13223 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013224 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013225 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013226 if (!key)
13227 goto err;
13228 res = PyDict_SetItem(new, key, Py_None);
13229 Py_DECREF(key);
13230 if (res < 0)
13231 goto err;
13232 }
13233 }
13234 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235 int kind;
13236 void *data;
13237
Georg Brandlceee0772007-11-27 23:48:05 +000013238 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013239 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013240 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13241 "to maketrans it must be a dict");
13242 goto err;
13243 }
13244 /* copy entries into the new dict, converting string keys to int keys */
13245 while (PyDict_Next(x, &i, &key, &value)) {
13246 if (PyUnicode_Check(key)) {
13247 /* convert string keys to integer keys */
13248 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013249 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013250 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13251 "table must be of length 1");
13252 goto err;
13253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 kind = PyUnicode_KIND(key);
13255 data = PyUnicode_DATA(key);
13256 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013257 if (!newkey)
13258 goto err;
13259 res = PyDict_SetItem(new, newkey, value);
13260 Py_DECREF(newkey);
13261 if (res < 0)
13262 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013263 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013264 /* just keep integer keys */
13265 if (PyDict_SetItem(new, key, value) < 0)
13266 goto err;
13267 } else {
13268 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13269 "be strings or integers");
13270 goto err;
13271 }
13272 }
13273 }
13274 return new;
13275 err:
13276 Py_DECREF(new);
13277 return NULL;
13278}
13279
INADA Naoki3ae20562017-01-16 20:41:20 +090013280/*[clinic input]
13281str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013282
INADA Naoki3ae20562017-01-16 20:41:20 +090013283 table: object
13284 Translation table, which must be a mapping of Unicode ordinals to
13285 Unicode ordinals, strings, or None.
13286 /
13287
13288Replace each character in the string using the given translation table.
13289
13290The table must implement lookup/indexing via __getitem__, for instance a
13291dictionary or list. If this operation raises LookupError, the character is
13292left untouched. Characters mapped to None are deleted.
13293[clinic start generated code]*/
13294
13295static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013296unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013297/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013299 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300}
13301
INADA Naoki3ae20562017-01-16 20:41:20 +090013302/*[clinic input]
13303str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304
INADA Naoki3ae20562017-01-16 20:41:20 +090013305Return a copy of the string converted to uppercase.
13306[clinic start generated code]*/
13307
13308static PyObject *
13309unicode_upper_impl(PyObject *self)
13310/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013312 if (PyUnicode_READY(self) == -1)
13313 return NULL;
13314 if (PyUnicode_IS_ASCII(self))
13315 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013316 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317}
13318
INADA Naoki3ae20562017-01-16 20:41:20 +090013319/*[clinic input]
13320str.zfill as unicode_zfill
13321
13322 width: Py_ssize_t
13323 /
13324
13325Pad a numeric string with zeros on the left, to fill a field of the given width.
13326
13327The string is never truncated.
13328[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013329
13330static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013331unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013332/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013333{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013334 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013335 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013336 int kind;
13337 void *data;
13338 Py_UCS4 chr;
13339
Benjamin Petersonbac79492012-01-14 13:34:47 -050013340 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013341 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342
Victor Stinnerc4b49542011-12-11 22:44:26 +010013343 if (PyUnicode_GET_LENGTH(self) >= width)
13344 return unicode_result_unchanged(self);
13345
13346 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347
13348 u = pad(self, fill, 0, '0');
13349
Walter Dörwald068325e2002-04-15 13:36:47 +000013350 if (u == NULL)
13351 return NULL;
13352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013353 kind = PyUnicode_KIND(u);
13354 data = PyUnicode_DATA(u);
13355 chr = PyUnicode_READ(kind, data, fill);
13356
13357 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013359 PyUnicode_WRITE(kind, data, 0, chr);
13360 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361 }
13362
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013363 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013364 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366
#if 0
/* Compiled out.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
13374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013375PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013378Return True if S starts with the specified prefix, False otherwise.\n\
13379With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013380With optional end, stop comparing S at that position.\n\
13381prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013382
13383static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013384unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013385 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013386{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013387 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013388 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013389 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013390 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013391 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013392
Jesus Ceaac451502011-04-20 17:09:23 +020013393 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013395 if (PyTuple_Check(subobj)) {
13396 Py_ssize_t i;
13397 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013398 substring = PyTuple_GET_ITEM(subobj, i);
13399 if (!PyUnicode_Check(substring)) {
13400 PyErr_Format(PyExc_TypeError,
13401 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013402 "not %.100s",
13403 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013404 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013405 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013406 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013407 if (result == -1)
13408 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013409 if (result) {
13410 Py_RETURN_TRUE;
13411 }
13412 }
13413 /* nothing matched */
13414 Py_RETURN_FALSE;
13415 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013416 if (!PyUnicode_Check(subobj)) {
13417 PyErr_Format(PyExc_TypeError,
13418 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013419 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013421 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013422 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013423 if (result == -1)
13424 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013425 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426}
13427
13428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013429PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013432Return True if S ends with the specified suffix, False otherwise.\n\
13433With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013434With optional end, stop comparing S at that position.\n\
13435suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436
13437static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013438unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013440{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013441 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013442 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013443 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013444 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013445 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446
Jesus Ceaac451502011-04-20 17:09:23 +020013447 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013449 if (PyTuple_Check(subobj)) {
13450 Py_ssize_t i;
13451 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013452 substring = PyTuple_GET_ITEM(subobj, i);
13453 if (!PyUnicode_Check(substring)) {
13454 PyErr_Format(PyExc_TypeError,
13455 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013456 "not %.100s",
13457 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013459 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013460 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013461 if (result == -1)
13462 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013463 if (result) {
13464 Py_RETURN_TRUE;
13465 }
13466 }
13467 Py_RETURN_FALSE;
13468 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013469 if (!PyUnicode_Check(subobj)) {
13470 PyErr_Format(PyExc_TypeError,
13471 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013472 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013474 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013475 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013476 if (result == -1)
13477 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013478 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013479}
13480
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013481static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013482_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013483{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013484 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13485 writer->data = PyUnicode_DATA(writer->buffer);
13486
13487 if (!writer->readonly) {
13488 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013489 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013490 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013491 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013492 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13493 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13494 writer->kind = PyUnicode_WCHAR_KIND;
13495 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13496
Victor Stinner8f674cc2013-04-17 23:02:17 +020013497 /* Copy-on-write mode: set buffer size to 0 so
13498 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13499 * next write. */
13500 writer->size = 0;
13501 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013502}
13503
Victor Stinnerd3f08822012-05-29 12:57:52 +020013504void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013505_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013506{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013507 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013508
13509 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013510 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013511
13512 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13513 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13514 writer->kind = PyUnicode_WCHAR_KIND;
13515 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013516}
13517
Victor Stinnerd3f08822012-05-29 12:57:52 +020013518int
13519_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13520 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013521{
13522 Py_ssize_t newlen;
13523 PyObject *newbuffer;
13524
Victor Stinner2740e462016-09-06 16:58:36 -070013525 assert(maxchar <= MAX_UNICODE);
13526
Victor Stinnerca9381e2015-09-22 00:58:32 +020013527 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013528 assert((maxchar > writer->maxchar && length >= 0)
13529 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530
Victor Stinner202fdca2012-05-07 12:47:02 +020013531 if (length > PY_SSIZE_T_MAX - writer->pos) {
13532 PyErr_NoMemory();
13533 return -1;
13534 }
13535 newlen = writer->pos + length;
13536
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013537 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013538
Victor Stinnerd3f08822012-05-29 12:57:52 +020013539 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013540 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013541 if (writer->overallocate
13542 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13543 /* overallocate to limit the number of realloc() */
13544 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013545 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013546 if (newlen < writer->min_length)
13547 newlen = writer->min_length;
13548
Victor Stinnerd3f08822012-05-29 12:57:52 +020013549 writer->buffer = PyUnicode_New(newlen, maxchar);
13550 if (writer->buffer == NULL)
13551 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013552 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013553 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013554 if (writer->overallocate
13555 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13556 /* overallocate to limit the number of realloc() */
13557 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013558 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013559 if (newlen < writer->min_length)
13560 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013561
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013562 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013563 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013564 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013565 newbuffer = PyUnicode_New(newlen, maxchar);
13566 if (newbuffer == NULL)
13567 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013568 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13569 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013570 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013571 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013572 }
13573 else {
13574 newbuffer = resize_compact(writer->buffer, newlen);
13575 if (newbuffer == NULL)
13576 return -1;
13577 }
13578 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013579 }
13580 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013581 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013582 newbuffer = PyUnicode_New(writer->size, maxchar);
13583 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013584 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013585 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13586 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013587 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013588 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013589 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013590 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013591
13592#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013593}
13594
Victor Stinnerca9381e2015-09-22 00:58:32 +020013595int
13596_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13597 enum PyUnicode_Kind kind)
13598{
13599 Py_UCS4 maxchar;
13600
13601 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13602 assert(writer->kind < kind);
13603
13604 switch (kind)
13605 {
13606 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13607 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13608 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13609 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013610 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013611 }
13612
13613 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13614}
13615
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013616static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013617_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013618{
Victor Stinner2740e462016-09-06 16:58:36 -070013619 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013620 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13621 return -1;
13622 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13623 writer->pos++;
13624 return 0;
13625}
13626
13627int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013628_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13629{
13630 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13631}
13632
13633int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013634_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13635{
13636 Py_UCS4 maxchar;
13637 Py_ssize_t len;
13638
13639 if (PyUnicode_READY(str) == -1)
13640 return -1;
13641 len = PyUnicode_GET_LENGTH(str);
13642 if (len == 0)
13643 return 0;
13644 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13645 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013646 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013647 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013648 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013649 Py_INCREF(str);
13650 writer->buffer = str;
13651 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013652 writer->pos += len;
13653 return 0;
13654 }
13655 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13656 return -1;
13657 }
13658 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13659 str, 0, len);
13660 writer->pos += len;
13661 return 0;
13662}
13663
Victor Stinnere215d962012-10-06 23:03:36 +020013664int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013665_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13666 Py_ssize_t start, Py_ssize_t end)
13667{
13668 Py_UCS4 maxchar;
13669 Py_ssize_t len;
13670
13671 if (PyUnicode_READY(str) == -1)
13672 return -1;
13673
13674 assert(0 <= start);
13675 assert(end <= PyUnicode_GET_LENGTH(str));
13676 assert(start <= end);
13677
13678 if (end == 0)
13679 return 0;
13680
13681 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13682 return _PyUnicodeWriter_WriteStr(writer, str);
13683
13684 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13685 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13686 else
13687 maxchar = writer->maxchar;
13688 len = end - start;
13689
13690 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13691 return -1;
13692
13693 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13694 str, start, len);
13695 writer->pos += len;
13696 return 0;
13697}
13698
13699int
Victor Stinner4a587072013-11-19 12:54:53 +010013700_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13701 const char *ascii, Py_ssize_t len)
13702{
13703 if (len == -1)
13704 len = strlen(ascii);
13705
13706 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13707
13708 if (writer->buffer == NULL && !writer->overallocate) {
13709 PyObject *str;
13710
13711 str = _PyUnicode_FromASCII(ascii, len);
13712 if (str == NULL)
13713 return -1;
13714
13715 writer->readonly = 1;
13716 writer->buffer = str;
13717 _PyUnicodeWriter_Update(writer);
13718 writer->pos += len;
13719 return 0;
13720 }
13721
13722 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13723 return -1;
13724
13725 switch (writer->kind)
13726 {
13727 case PyUnicode_1BYTE_KIND:
13728 {
13729 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13730 Py_UCS1 *data = writer->data;
13731
Christian Heimesf051e432016-09-13 20:22:02 +020013732 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013733 break;
13734 }
13735 case PyUnicode_2BYTE_KIND:
13736 {
13737 _PyUnicode_CONVERT_BYTES(
13738 Py_UCS1, Py_UCS2,
13739 ascii, ascii + len,
13740 (Py_UCS2 *)writer->data + writer->pos);
13741 break;
13742 }
13743 case PyUnicode_4BYTE_KIND:
13744 {
13745 _PyUnicode_CONVERT_BYTES(
13746 Py_UCS1, Py_UCS4,
13747 ascii, ascii + len,
13748 (Py_UCS4 *)writer->data + writer->pos);
13749 break;
13750 }
13751 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013752 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013753 }
13754
13755 writer->pos += len;
13756 return 0;
13757}
13758
13759int
13760_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13761 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013762{
13763 Py_UCS4 maxchar;
13764
13765 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13766 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13767 return -1;
13768 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13769 writer->pos += len;
13770 return 0;
13771}
13772
Victor Stinnerd3f08822012-05-29 12:57:52 +020013773PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013774_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013775{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013776 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013777
Victor Stinnerd3f08822012-05-29 12:57:52 +020013778 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013779 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013780 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013781 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013782
13783 str = writer->buffer;
13784 writer->buffer = NULL;
13785
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013786 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013787 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13788 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013789 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013790
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013791 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13792 PyObject *str2;
13793 str2 = resize_compact(str, writer->pos);
13794 if (str2 == NULL) {
13795 Py_DECREF(str);
13796 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013797 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013798 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013799 }
13800
Victor Stinner15a0bd32013-07-08 22:29:55 +020013801 assert(_PyUnicode_CheckConsistency(str, 1));
13802 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013803}
13804
Victor Stinnerd3f08822012-05-29 12:57:52 +020013805void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013806_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013807{
13808 Py_CLEAR(writer->buffer);
13809}
13810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013811#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013812
13813PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013815\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013816Return a formatted version of S, using substitutions from args and kwargs.\n\
13817The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013818
Eric Smith27bbca62010-11-04 17:06:58 +000013819PyDoc_STRVAR(format_map__doc__,
13820 "S.format_map(mapping) -> str\n\
13821\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013822Return a formatted version of S, using substitutions from mapping.\n\
13823The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013824
INADA Naoki3ae20562017-01-16 20:41:20 +090013825/*[clinic input]
13826str.__format__ as unicode___format__
13827
13828 format_spec: unicode
13829 /
13830
13831Return a formatted version of the string as described by format_spec.
13832[clinic start generated code]*/
13833
Eric Smith4a7d76d2008-05-30 18:10:19 +000013834static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013835unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013836/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013837{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013838 _PyUnicodeWriter writer;
13839 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013840
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 if (PyUnicode_READY(self) == -1)
13842 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013843 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013844 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13845 self, format_spec, 0,
13846 PyUnicode_GET_LENGTH(format_spec));
13847 if (ret == -1) {
13848 _PyUnicodeWriter_Dealloc(&writer);
13849 return NULL;
13850 }
13851 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013852}
13853
INADA Naoki3ae20562017-01-16 20:41:20 +090013854/*[clinic input]
13855str.__sizeof__ as unicode_sizeof
13856
13857Return the size of the string in memory, in bytes.
13858[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013859
13860static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013861unicode_sizeof_impl(PyObject *self)
13862/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013864 Py_ssize_t size;
13865
13866 /* If it's a compact object, account for base structure +
13867 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013868 if (PyUnicode_IS_COMPACT_ASCII(self))
13869 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13870 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013871 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013872 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013873 else {
13874 /* If it is a two-block object, account for base object, and
13875 for character block if present. */
13876 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013877 if (_PyUnicode_DATA_ANY(self))
13878 size += (PyUnicode_GET_LENGTH(self) + 1) *
13879 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013880 }
13881 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013882 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013883 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13884 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13885 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13886 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013887
13888 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013889}
13890
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013891static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013892unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013893{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013894 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013895 if (!copy)
13896 return NULL;
13897 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013898}
13899
Guido van Rossumd57fd912000-03-10 22:53:23 +000013900static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013901 UNICODE_ENCODE_METHODDEF
13902 UNICODE_REPLACE_METHODDEF
13903 UNICODE_SPLIT_METHODDEF
13904 UNICODE_RSPLIT_METHODDEF
13905 UNICODE_JOIN_METHODDEF
13906 UNICODE_CAPITALIZE_METHODDEF
13907 UNICODE_CASEFOLD_METHODDEF
13908 UNICODE_TITLE_METHODDEF
13909 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013910 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013911 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013912 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013913 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013914 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013915 UNICODE_LJUST_METHODDEF
13916 UNICODE_LOWER_METHODDEF
13917 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013918 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13919 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013920 UNICODE_RJUST_METHODDEF
13921 UNICODE_RSTRIP_METHODDEF
13922 UNICODE_RPARTITION_METHODDEF
13923 UNICODE_SPLITLINES_METHODDEF
13924 UNICODE_STRIP_METHODDEF
13925 UNICODE_SWAPCASE_METHODDEF
13926 UNICODE_TRANSLATE_METHODDEF
13927 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013928 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13929 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013930 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013931 UNICODE_ISLOWER_METHODDEF
13932 UNICODE_ISUPPER_METHODDEF
13933 UNICODE_ISTITLE_METHODDEF
13934 UNICODE_ISSPACE_METHODDEF
13935 UNICODE_ISDECIMAL_METHODDEF
13936 UNICODE_ISDIGIT_METHODDEF
13937 UNICODE_ISNUMERIC_METHODDEF
13938 UNICODE_ISALPHA_METHODDEF
13939 UNICODE_ISALNUM_METHODDEF
13940 UNICODE_ISIDENTIFIER_METHODDEF
13941 UNICODE_ISPRINTABLE_METHODDEF
13942 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013943 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013944 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013945 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013946 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013947 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013948#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013949 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013950 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951#endif
13952
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013953 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013954 {NULL, NULL}
13955};
13956
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013957static PyObject *
13958unicode_mod(PyObject *v, PyObject *w)
13959{
Brian Curtindfc80e32011-08-10 20:28:54 -050013960 if (!PyUnicode_Check(v))
13961 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013962 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013963}
13964
13965static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 0, /*nb_add*/
13967 0, /*nb_subtract*/
13968 0, /*nb_multiply*/
13969 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013970};
13971
Guido van Rossumd57fd912000-03-10 22:53:23 +000013972static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013973 (lenfunc) unicode_length, /* sq_length */
13974 PyUnicode_Concat, /* sq_concat */
13975 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13976 (ssizeargfunc) unicode_getitem, /* sq_item */
13977 0, /* sq_slice */
13978 0, /* sq_ass_item */
13979 0, /* sq_ass_slice */
13980 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013981};
13982
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013983static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013984unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013986 if (PyUnicode_READY(self) == -1)
13987 return NULL;
13988
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013989 if (PyIndex_Check(item)) {
13990 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013991 if (i == -1 && PyErr_Occurred())
13992 return NULL;
13993 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013994 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013995 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013996 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013997 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013998 PyObject *result;
13999 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014000 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014001 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014002
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014003 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014004 return NULL;
14005 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014006 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14007 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014008
14009 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014010 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014011 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014012 slicelength == PyUnicode_GET_LENGTH(self)) {
14013 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014014 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014015 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014016 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014017 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014018 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014019 src_kind = PyUnicode_KIND(self);
14020 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014021 if (!PyUnicode_IS_ASCII(self)) {
14022 kind_limit = kind_maxchar_limit(src_kind);
14023 max_char = 0;
14024 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14025 ch = PyUnicode_READ(src_kind, src_data, cur);
14026 if (ch > max_char) {
14027 max_char = ch;
14028 if (max_char >= kind_limit)
14029 break;
14030 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014031 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014032 }
Victor Stinner55c99112011-10-13 01:17:06 +020014033 else
14034 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014035 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014036 if (result == NULL)
14037 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014038 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014039 dest_data = PyUnicode_DATA(result);
14040
14041 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014042 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14043 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014044 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014045 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014046 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014047 } else {
14048 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14049 return NULL;
14050 }
14051}
14052
14053static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 (lenfunc)unicode_length, /* mp_length */
14055 (binaryfunc)unicode_subscript, /* mp_subscript */
14056 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014057};
14058
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059
Guido van Rossumd57fd912000-03-10 22:53:23 +000014060/* Helpers for PyUnicode_Format() */
14061
Victor Stinnera47082312012-10-04 02:19:54 +020014062struct unicode_formatter_t {
14063 PyObject *args;
14064 int args_owned;
14065 Py_ssize_t arglen, argidx;
14066 PyObject *dict;
14067
14068 enum PyUnicode_Kind fmtkind;
14069 Py_ssize_t fmtcnt, fmtpos;
14070 void *fmtdata;
14071 PyObject *fmtstr;
14072
14073 _PyUnicodeWriter writer;
14074};
14075
14076struct unicode_format_arg_t {
14077 Py_UCS4 ch;
14078 int flags;
14079 Py_ssize_t width;
14080 int prec;
14081 int sign;
14082};
14083
Guido van Rossumd57fd912000-03-10 22:53:23 +000014084static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014085unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014086{
Victor Stinnera47082312012-10-04 02:19:54 +020014087 Py_ssize_t argidx = ctx->argidx;
14088
14089 if (argidx < ctx->arglen) {
14090 ctx->argidx++;
14091 if (ctx->arglen < 0)
14092 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014093 else
Victor Stinnera47082312012-10-04 02:19:54 +020014094 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014095 }
14096 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014097 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014098 return NULL;
14099}
14100
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014101/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014102
Victor Stinnera47082312012-10-04 02:19:54 +020014103/* Format a float into the writer if the writer is not NULL, or into *p_output
14104 otherwise.
14105
14106 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014107static int
Victor Stinnera47082312012-10-04 02:19:54 +020014108formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14109 PyObject **p_output,
14110 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014111{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014112 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014113 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014114 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014115 int prec;
14116 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014117
Guido van Rossumd57fd912000-03-10 22:53:23 +000014118 x = PyFloat_AsDouble(v);
14119 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014120 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014121
Victor Stinnera47082312012-10-04 02:19:54 +020014122 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014123 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014124 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014125
Victor Stinnera47082312012-10-04 02:19:54 +020014126 if (arg->flags & F_ALT)
14127 dtoa_flags = Py_DTSF_ALT;
14128 else
14129 dtoa_flags = 0;
14130 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014131 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014132 return -1;
14133 len = strlen(p);
14134 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014135 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014136 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014137 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014138 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014139 }
14140 else
14141 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014142 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014143 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014144}
14145
Victor Stinnerd0880d52012-04-27 23:40:13 +020014146/* formatlong() emulates the format codes d, u, o, x and X, and
14147 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14148 * Python's regular ints.
14149 * Return value: a new PyUnicodeObject*, or NULL if error.
14150 * The output string is of the form
14151 * "-"? ("0x" | "0X")? digit+
14152 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14153 * set in flags. The case of hex digits will be correct,
14154 * There will be at least prec digits, zero-filled on the left if
14155 * necessary to get that many.
14156 * val object to be converted
14157 * flags bitmask of format flags; only F_ALT is looked at
14158 * prec minimum number of digits; 0-fill on left if needed
14159 * type a character in [duoxX]; u acts the same as d
14160 *
14161 * CAUTION: o, x and X conversions on regular ints can never
14162 * produce a '-' sign, but can for Python's unbounded ints.
14163 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014164PyObject *
14165_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014166{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014167 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014168 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014169 Py_ssize_t i;
14170 int sign; /* 1 if '-', else 0 */
14171 int len; /* number of characters */
14172 Py_ssize_t llen;
14173 int numdigits; /* len == numnondigits + numdigits */
14174 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014175
Victor Stinnerd0880d52012-04-27 23:40:13 +020014176 /* Avoid exceeding SSIZE_T_MAX */
14177 if (prec > INT_MAX-3) {
14178 PyErr_SetString(PyExc_OverflowError,
14179 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014181 }
14182
14183 assert(PyLong_Check(val));
14184
14185 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014186 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014187 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014188 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014189 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014190 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014191 /* int and int subclasses should print numerically when a numeric */
14192 /* format code is used (see issue18780) */
14193 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014194 break;
14195 case 'o':
14196 numnondigits = 2;
14197 result = PyNumber_ToBase(val, 8);
14198 break;
14199 case 'x':
14200 case 'X':
14201 numnondigits = 2;
14202 result = PyNumber_ToBase(val, 16);
14203 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014204 }
14205 if (!result)
14206 return NULL;
14207
14208 assert(unicode_modifiable(result));
14209 assert(PyUnicode_IS_READY(result));
14210 assert(PyUnicode_IS_ASCII(result));
14211
14212 /* To modify the string in-place, there can only be one reference. */
14213 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014214 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014215 PyErr_BadInternalCall();
14216 return NULL;
14217 }
14218 buf = PyUnicode_DATA(result);
14219 llen = PyUnicode_GET_LENGTH(result);
14220 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014221 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014222 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014223 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014224 return NULL;
14225 }
14226 len = (int)llen;
14227 sign = buf[0] == '-';
14228 numnondigits += sign;
14229 numdigits = len - numnondigits;
14230 assert(numdigits > 0);
14231
14232 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014233 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014234 (type == 'o' || type == 'x' || type == 'X'))) {
14235 assert(buf[sign] == '0');
14236 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14237 buf[sign+1] == 'o');
14238 numnondigits -= 2;
14239 buf += 2;
14240 len -= 2;
14241 if (sign)
14242 buf[0] = '-';
14243 assert(len == numnondigits + numdigits);
14244 assert(numdigits > 0);
14245 }
14246
14247 /* Fill with leading zeroes to meet minimum width. */
14248 if (prec > numdigits) {
14249 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14250 numnondigits + prec);
14251 char *b1;
14252 if (!r1) {
14253 Py_DECREF(result);
14254 return NULL;
14255 }
14256 b1 = PyBytes_AS_STRING(r1);
14257 for (i = 0; i < numnondigits; ++i)
14258 *b1++ = *buf++;
14259 for (i = 0; i < prec - numdigits; i++)
14260 *b1++ = '0';
14261 for (i = 0; i < numdigits; i++)
14262 *b1++ = *buf++;
14263 *b1 = '\0';
14264 Py_DECREF(result);
14265 result = r1;
14266 buf = PyBytes_AS_STRING(result);
14267 len = numnondigits + prec;
14268 }
14269
14270 /* Fix up case for hex conversions. */
14271 if (type == 'X') {
14272 /* Need to convert all lower case letters to upper case.
14273 and need to convert 0x to 0X (and -0x to -0X). */
14274 for (i = 0; i < len; i++)
14275 if (buf[i] >= 'a' && buf[i] <= 'x')
14276 buf[i] -= 'a'-'A';
14277 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014278 if (!PyUnicode_Check(result)
14279 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014280 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014281 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014282 Py_DECREF(result);
14283 result = unicode;
14284 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014285 else if (len != PyUnicode_GET_LENGTH(result)) {
14286 if (PyUnicode_Resize(&result, len) < 0)
14287 Py_CLEAR(result);
14288 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014289 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014290}
14291
Ethan Furmandf3ed242014-01-05 06:50:30 -080014292/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014293 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014294 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014295 * -1 and raise an exception on error */
14296static int
Victor Stinnera47082312012-10-04 02:19:54 +020014297mainformatlong(PyObject *v,
14298 struct unicode_format_arg_t *arg,
14299 PyObject **p_output,
14300 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014301{
14302 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014303 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014304
14305 if (!PyNumber_Check(v))
14306 goto wrongtype;
14307
Ethan Furman9ab74802014-03-21 06:38:46 -070014308 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014309 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014310 if (type == 'o' || type == 'x' || type == 'X') {
14311 iobj = PyNumber_Index(v);
14312 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014313 if (PyErr_ExceptionMatches(PyExc_TypeError))
14314 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014315 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014316 }
14317 }
14318 else {
14319 iobj = PyNumber_Long(v);
14320 if (iobj == NULL ) {
14321 if (PyErr_ExceptionMatches(PyExc_TypeError))
14322 goto wrongtype;
14323 return -1;
14324 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014325 }
14326 assert(PyLong_Check(iobj));
14327 }
14328 else {
14329 iobj = v;
14330 Py_INCREF(iobj);
14331 }
14332
14333 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014334 && arg->width == -1 && arg->prec == -1
14335 && !(arg->flags & (F_SIGN | F_BLANK))
14336 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014337 {
14338 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014339 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014340 int base;
14341
Victor Stinnera47082312012-10-04 02:19:54 +020014342 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014343 {
14344 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014345 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014346 case 'd':
14347 case 'i':
14348 case 'u':
14349 base = 10;
14350 break;
14351 case 'o':
14352 base = 8;
14353 break;
14354 case 'x':
14355 case 'X':
14356 base = 16;
14357 break;
14358 }
14359
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014360 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14361 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014362 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014363 }
14364 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014365 return 1;
14366 }
14367
Ethan Furmanb95b5612015-01-23 20:05:18 -080014368 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014369 Py_DECREF(iobj);
14370 if (res == NULL)
14371 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014372 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014373 return 0;
14374
14375wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014376 switch(type)
14377 {
14378 case 'o':
14379 case 'x':
14380 case 'X':
14381 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014382 "%%%c format: an integer is required, "
14383 "not %.200s",
14384 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014385 break;
14386 default:
14387 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014388 "%%%c format: a number is required, "
14389 "not %.200s",
14390 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014391 break;
14392 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014393 return -1;
14394}
14395
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014396static Py_UCS4
14397formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014398{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014399 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014400 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014401 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014402 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014403 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014404 goto onError;
14405 }
14406 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014407 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014408 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014409 /* make sure number is a type of integer */
14410 if (!PyLong_Check(v)) {
14411 iobj = PyNumber_Index(v);
14412 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014413 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014414 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014415 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014416 Py_DECREF(iobj);
14417 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014418 else {
14419 x = PyLong_AsLong(v);
14420 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014421 if (x == -1 && PyErr_Occurred())
14422 goto onError;
14423
Victor Stinner8faf8212011-12-08 22:14:11 +010014424 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014425 PyErr_SetString(PyExc_OverflowError,
14426 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014427 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014428 }
14429
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014430 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014431 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014432
Benjamin Peterson29060642009-01-31 22:14:21 +000014433 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014434 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014435 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014436 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014437}
14438
Victor Stinnera47082312012-10-04 02:19:54 +020014439/* Parse options of an argument: flags, width, precision.
14440 Handle also "%(name)" syntax.
14441
14442 Return 0 if the argument has been formatted into arg->str.
14443 Return 1 if the argument has been written into ctx->writer,
14444 Raise an exception and return -1 on error. */
14445static int
14446unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14447 struct unicode_format_arg_t *arg)
14448{
14449#define FORMAT_READ(ctx) \
14450 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14451
14452 PyObject *v;
14453
Victor Stinnera47082312012-10-04 02:19:54 +020014454 if (arg->ch == '(') {
14455 /* Get argument value from a dictionary. Example: "%(name)s". */
14456 Py_ssize_t keystart;
14457 Py_ssize_t keylen;
14458 PyObject *key;
14459 int pcount = 1;
14460
14461 if (ctx->dict == NULL) {
14462 PyErr_SetString(PyExc_TypeError,
14463 "format requires a mapping");
14464 return -1;
14465 }
14466 ++ctx->fmtpos;
14467 --ctx->fmtcnt;
14468 keystart = ctx->fmtpos;
14469 /* Skip over balanced parentheses */
14470 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14471 arg->ch = FORMAT_READ(ctx);
14472 if (arg->ch == ')')
14473 --pcount;
14474 else if (arg->ch == '(')
14475 ++pcount;
14476 ctx->fmtpos++;
14477 }
14478 keylen = ctx->fmtpos - keystart - 1;
14479 if (ctx->fmtcnt < 0 || pcount > 0) {
14480 PyErr_SetString(PyExc_ValueError,
14481 "incomplete format key");
14482 return -1;
14483 }
14484 key = PyUnicode_Substring(ctx->fmtstr,
14485 keystart, keystart + keylen);
14486 if (key == NULL)
14487 return -1;
14488 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014489 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014490 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014491 }
14492 ctx->args = PyObject_GetItem(ctx->dict, key);
14493 Py_DECREF(key);
14494 if (ctx->args == NULL)
14495 return -1;
14496 ctx->args_owned = 1;
14497 ctx->arglen = -1;
14498 ctx->argidx = -2;
14499 }
14500
14501 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014502 while (--ctx->fmtcnt >= 0) {
14503 arg->ch = FORMAT_READ(ctx);
14504 ctx->fmtpos++;
14505 switch (arg->ch) {
14506 case '-': arg->flags |= F_LJUST; continue;
14507 case '+': arg->flags |= F_SIGN; continue;
14508 case ' ': arg->flags |= F_BLANK; continue;
14509 case '#': arg->flags |= F_ALT; continue;
14510 case '0': arg->flags |= F_ZERO; continue;
14511 }
14512 break;
14513 }
14514
14515 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014516 if (arg->ch == '*') {
14517 v = unicode_format_getnextarg(ctx);
14518 if (v == NULL)
14519 return -1;
14520 if (!PyLong_Check(v)) {
14521 PyErr_SetString(PyExc_TypeError,
14522 "* wants int");
14523 return -1;
14524 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014525 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014526 if (arg->width == -1 && PyErr_Occurred())
14527 return -1;
14528 if (arg->width < 0) {
14529 arg->flags |= F_LJUST;
14530 arg->width = -arg->width;
14531 }
14532 if (--ctx->fmtcnt >= 0) {
14533 arg->ch = FORMAT_READ(ctx);
14534 ctx->fmtpos++;
14535 }
14536 }
14537 else if (arg->ch >= '0' && arg->ch <= '9') {
14538 arg->width = arg->ch - '0';
14539 while (--ctx->fmtcnt >= 0) {
14540 arg->ch = FORMAT_READ(ctx);
14541 ctx->fmtpos++;
14542 if (arg->ch < '0' || arg->ch > '9')
14543 break;
14544 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14545 mixing signed and unsigned comparison. Since arg->ch is between
14546 '0' and '9', casting to int is safe. */
14547 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14548 PyErr_SetString(PyExc_ValueError,
14549 "width too big");
14550 return -1;
14551 }
14552 arg->width = arg->width*10 + (arg->ch - '0');
14553 }
14554 }
14555
14556 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014557 if (arg->ch == '.') {
14558 arg->prec = 0;
14559 if (--ctx->fmtcnt >= 0) {
14560 arg->ch = FORMAT_READ(ctx);
14561 ctx->fmtpos++;
14562 }
14563 if (arg->ch == '*') {
14564 v = unicode_format_getnextarg(ctx);
14565 if (v == NULL)
14566 return -1;
14567 if (!PyLong_Check(v)) {
14568 PyErr_SetString(PyExc_TypeError,
14569 "* wants int");
14570 return -1;
14571 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014572 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014573 if (arg->prec == -1 && PyErr_Occurred())
14574 return -1;
14575 if (arg->prec < 0)
14576 arg->prec = 0;
14577 if (--ctx->fmtcnt >= 0) {
14578 arg->ch = FORMAT_READ(ctx);
14579 ctx->fmtpos++;
14580 }
14581 }
14582 else if (arg->ch >= '0' && arg->ch <= '9') {
14583 arg->prec = arg->ch - '0';
14584 while (--ctx->fmtcnt >= 0) {
14585 arg->ch = FORMAT_READ(ctx);
14586 ctx->fmtpos++;
14587 if (arg->ch < '0' || arg->ch > '9')
14588 break;
14589 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14590 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014591 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014592 return -1;
14593 }
14594 arg->prec = arg->prec*10 + (arg->ch - '0');
14595 }
14596 }
14597 }
14598
14599 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14600 if (ctx->fmtcnt >= 0) {
14601 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14602 if (--ctx->fmtcnt >= 0) {
14603 arg->ch = FORMAT_READ(ctx);
14604 ctx->fmtpos++;
14605 }
14606 }
14607 }
14608 if (ctx->fmtcnt < 0) {
14609 PyErr_SetString(PyExc_ValueError,
14610 "incomplete format");
14611 return -1;
14612 }
14613 return 0;
14614
14615#undef FORMAT_READ
14616}
14617
14618/* Format one argument. Supported conversion specifiers:
14619
14620 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014621 - "i", "d", "u": int or float
14622 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014623 - "e", "E", "f", "F", "g", "G": float
14624 - "c": int or str (1 character)
14625
Victor Stinner8dbd4212012-12-04 09:30:24 +010014626 When possible, the output is written directly into the Unicode writer
14627 (ctx->writer). A string is created when padding is required.
14628
Victor Stinnera47082312012-10-04 02:19:54 +020014629 Return 0 if the argument has been formatted into *p_str,
14630 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014631 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014632static int
14633unicode_format_arg_format(struct unicode_formatter_t *ctx,
14634 struct unicode_format_arg_t *arg,
14635 PyObject **p_str)
14636{
14637 PyObject *v;
14638 _PyUnicodeWriter *writer = &ctx->writer;
14639
14640 if (ctx->fmtcnt == 0)
14641 ctx->writer.overallocate = 0;
14642
Victor Stinnera47082312012-10-04 02:19:54 +020014643 v = unicode_format_getnextarg(ctx);
14644 if (v == NULL)
14645 return -1;
14646
Victor Stinnera47082312012-10-04 02:19:54 +020014647
14648 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014649 case 's':
14650 case 'r':
14651 case 'a':
14652 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14653 /* Fast path */
14654 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14655 return -1;
14656 return 1;
14657 }
14658
14659 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14660 *p_str = v;
14661 Py_INCREF(*p_str);
14662 }
14663 else {
14664 if (arg->ch == 's')
14665 *p_str = PyObject_Str(v);
14666 else if (arg->ch == 'r')
14667 *p_str = PyObject_Repr(v);
14668 else
14669 *p_str = PyObject_ASCII(v);
14670 }
14671 break;
14672
14673 case 'i':
14674 case 'd':
14675 case 'u':
14676 case 'o':
14677 case 'x':
14678 case 'X':
14679 {
14680 int ret = mainformatlong(v, arg, p_str, writer);
14681 if (ret != 0)
14682 return ret;
14683 arg->sign = 1;
14684 break;
14685 }
14686
14687 case 'e':
14688 case 'E':
14689 case 'f':
14690 case 'F':
14691 case 'g':
14692 case 'G':
14693 if (arg->width == -1 && arg->prec == -1
14694 && !(arg->flags & (F_SIGN | F_BLANK)))
14695 {
14696 /* Fast path */
14697 if (formatfloat(v, arg, NULL, writer) == -1)
14698 return -1;
14699 return 1;
14700 }
14701
14702 arg->sign = 1;
14703 if (formatfloat(v, arg, p_str, NULL) == -1)
14704 return -1;
14705 break;
14706
14707 case 'c':
14708 {
14709 Py_UCS4 ch = formatchar(v);
14710 if (ch == (Py_UCS4) -1)
14711 return -1;
14712 if (arg->width == -1 && arg->prec == -1) {
14713 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014714 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014715 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014716 return 1;
14717 }
14718 *p_str = PyUnicode_FromOrdinal(ch);
14719 break;
14720 }
14721
14722 default:
14723 PyErr_Format(PyExc_ValueError,
14724 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014725 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014726 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14727 (int)arg->ch,
14728 ctx->fmtpos - 1);
14729 return -1;
14730 }
14731 if (*p_str == NULL)
14732 return -1;
14733 assert (PyUnicode_Check(*p_str));
14734 return 0;
14735}
14736
14737static int
14738unicode_format_arg_output(struct unicode_formatter_t *ctx,
14739 struct unicode_format_arg_t *arg,
14740 PyObject *str)
14741{
14742 Py_ssize_t len;
14743 enum PyUnicode_Kind kind;
14744 void *pbuf;
14745 Py_ssize_t pindex;
14746 Py_UCS4 signchar;
14747 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014748 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014749 Py_ssize_t sublen;
14750 _PyUnicodeWriter *writer = &ctx->writer;
14751 Py_UCS4 fill;
14752
14753 fill = ' ';
14754 if (arg->sign && arg->flags & F_ZERO)
14755 fill = '0';
14756
14757 if (PyUnicode_READY(str) == -1)
14758 return -1;
14759
14760 len = PyUnicode_GET_LENGTH(str);
14761 if ((arg->width == -1 || arg->width <= len)
14762 && (arg->prec == -1 || arg->prec >= len)
14763 && !(arg->flags & (F_SIGN | F_BLANK)))
14764 {
14765 /* Fast path */
14766 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14767 return -1;
14768 return 0;
14769 }
14770
14771 /* Truncate the string for "s", "r" and "a" formats
14772 if the precision is set */
14773 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14774 if (arg->prec >= 0 && len > arg->prec)
14775 len = arg->prec;
14776 }
14777
14778 /* Adjust sign and width */
14779 kind = PyUnicode_KIND(str);
14780 pbuf = PyUnicode_DATA(str);
14781 pindex = 0;
14782 signchar = '\0';
14783 if (arg->sign) {
14784 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14785 if (ch == '-' || ch == '+') {
14786 signchar = ch;
14787 len--;
14788 pindex++;
14789 }
14790 else if (arg->flags & F_SIGN)
14791 signchar = '+';
14792 else if (arg->flags & F_BLANK)
14793 signchar = ' ';
14794 else
14795 arg->sign = 0;
14796 }
14797 if (arg->width < len)
14798 arg->width = len;
14799
14800 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014801 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014802 if (!(arg->flags & F_LJUST)) {
14803 if (arg->sign) {
14804 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014805 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014806 }
14807 else {
14808 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014809 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014810 }
14811 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014812 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14813 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014814 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014815 }
14816
Victor Stinnera47082312012-10-04 02:19:54 +020014817 buflen = arg->width;
14818 if (arg->sign && len == arg->width)
14819 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014820 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014821 return -1;
14822
14823 /* Write the sign if needed */
14824 if (arg->sign) {
14825 if (fill != ' ') {
14826 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14827 writer->pos += 1;
14828 }
14829 if (arg->width > len)
14830 arg->width--;
14831 }
14832
14833 /* Write the numeric prefix for "x", "X" and "o" formats
14834 if the alternate form is used.
14835 For example, write "0x" for the "%#x" format. */
14836 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14837 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14838 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14839 if (fill != ' ') {
14840 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14841 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14842 writer->pos += 2;
14843 pindex += 2;
14844 }
14845 arg->width -= 2;
14846 if (arg->width < 0)
14847 arg->width = 0;
14848 len -= 2;
14849 }
14850
14851 /* Pad left with the fill character if needed */
14852 if (arg->width > len && !(arg->flags & F_LJUST)) {
14853 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014854 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014855 writer->pos += sublen;
14856 arg->width = len;
14857 }
14858
14859 /* If padding with spaces: write sign if needed and/or numeric prefix if
14860 the alternate form is used */
14861 if (fill == ' ') {
14862 if (arg->sign) {
14863 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14864 writer->pos += 1;
14865 }
14866 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14867 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14868 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14869 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14870 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14871 writer->pos += 2;
14872 pindex += 2;
14873 }
14874 }
14875
14876 /* Write characters */
14877 if (len) {
14878 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14879 str, pindex, len);
14880 writer->pos += len;
14881 }
14882
14883 /* Pad right with the fill character if needed */
14884 if (arg->width > len) {
14885 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014886 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014887 writer->pos += sublen;
14888 }
14889 return 0;
14890}
14891
14892/* Helper of PyUnicode_Format(): format one arg.
14893 Return 0 on success, raise an exception and return -1 on error. */
14894static int
14895unicode_format_arg(struct unicode_formatter_t *ctx)
14896{
14897 struct unicode_format_arg_t arg;
14898 PyObject *str;
14899 int ret;
14900
Victor Stinner8dbd4212012-12-04 09:30:24 +010014901 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014902 if (arg.ch == '%') {
14903 ctx->fmtpos++;
14904 ctx->fmtcnt--;
14905 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14906 return -1;
14907 return 0;
14908 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014909 arg.flags = 0;
14910 arg.width = -1;
14911 arg.prec = -1;
14912 arg.sign = 0;
14913 str = NULL;
14914
Victor Stinnera47082312012-10-04 02:19:54 +020014915 ret = unicode_format_arg_parse(ctx, &arg);
14916 if (ret == -1)
14917 return -1;
14918
14919 ret = unicode_format_arg_format(ctx, &arg, &str);
14920 if (ret == -1)
14921 return -1;
14922
14923 if (ret != 1) {
14924 ret = unicode_format_arg_output(ctx, &arg, str);
14925 Py_DECREF(str);
14926 if (ret == -1)
14927 return -1;
14928 }
14929
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014930 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014931 PyErr_SetString(PyExc_TypeError,
14932 "not all arguments converted during string formatting");
14933 return -1;
14934 }
14935 return 0;
14936}
14937
Alexander Belopolsky40018472011-02-26 01:02:56 +000014938PyObject *
14939PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014940{
Victor Stinnera47082312012-10-04 02:19:54 +020014941 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014942
Guido van Rossumd57fd912000-03-10 22:53:23 +000014943 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014944 PyErr_BadInternalCall();
14945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946 }
Victor Stinnera47082312012-10-04 02:19:54 +020014947
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014948 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014949 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014950
14951 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014952 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14953 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14954 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14955 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014956
Victor Stinner8f674cc2013-04-17 23:02:17 +020014957 _PyUnicodeWriter_Init(&ctx.writer);
14958 ctx.writer.min_length = ctx.fmtcnt + 100;
14959 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014960
Guido van Rossumd57fd912000-03-10 22:53:23 +000014961 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014962 ctx.arglen = PyTuple_Size(args);
14963 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014964 }
14965 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014966 ctx.arglen = -1;
14967 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014968 }
Victor Stinnera47082312012-10-04 02:19:54 +020014969 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014970 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014971 ctx.dict = args;
14972 else
14973 ctx.dict = NULL;
14974 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014975
Victor Stinnera47082312012-10-04 02:19:54 +020014976 while (--ctx.fmtcnt >= 0) {
14977 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014978 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014979
14980 nonfmtpos = ctx.fmtpos++;
14981 while (ctx.fmtcnt >= 0 &&
14982 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14983 ctx.fmtpos++;
14984 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014985 }
Victor Stinnera47082312012-10-04 02:19:54 +020014986 if (ctx.fmtcnt < 0) {
14987 ctx.fmtpos--;
14988 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014989 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014990
Victor Stinnercfc4c132013-04-03 01:48:39 +020014991 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14992 nonfmtpos, ctx.fmtpos) < 0)
14993 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014994 }
14995 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014996 ctx.fmtpos++;
14997 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014998 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014999 }
15000 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015001
Victor Stinnera47082312012-10-04 02:19:54 +020015002 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015003 PyErr_SetString(PyExc_TypeError,
15004 "not all arguments converted during string formatting");
15005 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015006 }
15007
Victor Stinnera47082312012-10-04 02:19:54 +020015008 if (ctx.args_owned) {
15009 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015010 }
Victor Stinnera47082312012-10-04 02:19:54 +020015011 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015012
Benjamin Peterson29060642009-01-31 22:14:21 +000015013 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015014 _PyUnicodeWriter_Dealloc(&ctx.writer);
15015 if (ctx.args_owned) {
15016 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015017 }
15018 return NULL;
15019}
15020
Jeremy Hylton938ace62002-07-17 16:30:39 +000015021static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015022unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15023
Tim Peters6d6c1a32001-08-02 04:15:00 +000015024static PyObject *
15025unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15026{
Benjamin Peterson29060642009-01-31 22:14:21 +000015027 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015028 static char *kwlist[] = {"object", "encoding", "errors", 0};
15029 char *encoding = NULL;
15030 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015031
Benjamin Peterson14339b62009-01-31 16:36:08 +000015032 if (type != &PyUnicode_Type)
15033 return unicode_subtype_new(type, args, kwds);
15034 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015035 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015036 return NULL;
15037 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015038 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015039 if (encoding == NULL && errors == NULL)
15040 return PyObject_Str(x);
15041 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015042 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015043}
15044
Guido van Rossume023fe02001-08-30 03:12:59 +000015045static PyObject *
15046unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15047{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015048 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015049 Py_ssize_t length, char_size;
15050 int share_wstr, share_utf8;
15051 unsigned int kind;
15052 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015053
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015056 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015057 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015058 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015059 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015060 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015061 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015063 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015064
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015065 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015066 if (self == NULL) {
15067 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 return NULL;
15069 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015070 kind = PyUnicode_KIND(unicode);
15071 length = PyUnicode_GET_LENGTH(unicode);
15072
15073 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015074#ifdef Py_DEBUG
15075 _PyUnicode_HASH(self) = -1;
15076#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015077 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015078#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015079 _PyUnicode_STATE(self).interned = 0;
15080 _PyUnicode_STATE(self).kind = kind;
15081 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015082 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015083 _PyUnicode_STATE(self).ready = 1;
15084 _PyUnicode_WSTR(self) = NULL;
15085 _PyUnicode_UTF8_LENGTH(self) = 0;
15086 _PyUnicode_UTF8(self) = NULL;
15087 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015088 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015089
15090 share_utf8 = 0;
15091 share_wstr = 0;
15092 if (kind == PyUnicode_1BYTE_KIND) {
15093 char_size = 1;
15094 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15095 share_utf8 = 1;
15096 }
15097 else if (kind == PyUnicode_2BYTE_KIND) {
15098 char_size = 2;
15099 if (sizeof(wchar_t) == 2)
15100 share_wstr = 1;
15101 }
15102 else {
15103 assert(kind == PyUnicode_4BYTE_KIND);
15104 char_size = 4;
15105 if (sizeof(wchar_t) == 4)
15106 share_wstr = 1;
15107 }
15108
15109 /* Ensure we won't overflow the length. */
15110 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15111 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015112 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015114 data = PyObject_MALLOC((length + 1) * char_size);
15115 if (data == NULL) {
15116 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015117 goto onError;
15118 }
15119
Victor Stinnerc3c74152011-10-02 20:39:55 +020015120 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015121 if (share_utf8) {
15122 _PyUnicode_UTF8_LENGTH(self) = length;
15123 _PyUnicode_UTF8(self) = data;
15124 }
15125 if (share_wstr) {
15126 _PyUnicode_WSTR_LENGTH(self) = length;
15127 _PyUnicode_WSTR(self) = (wchar_t *)data;
15128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015129
Christian Heimesf051e432016-09-13 20:22:02 +020015130 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015131 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015132 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015133#ifdef Py_DEBUG
15134 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15135#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015136 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015137 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015138
15139onError:
15140 Py_DECREF(unicode);
15141 Py_DECREF(self);
15142 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015143}
15144
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015156
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015157static PyObject *unicode_iter(PyObject *seq);
15158
/* Type object for the built-in "str" type.  The initializer is positional:
   each entry fills the slot named in its trailing comment, and a 0 entry
   leaves that slot unset here.  The type is readied by PyType_Ready() in
   _PyUnicode_Init(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    /* Py_TPFLAGS_BASETYPE: str may be subclassed.
       Py_TPFLAGS_UNICODE_SUBCLASS: flag bit marking str and its subclasses
       (presumably what PyUnicode_Check() tests -- confirm in object.h). */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15202
15203/* Initialize the Unicode implementation */
15204
Victor Stinner3a50e702011-10-18 21:21:00 +020015205int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015206{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015207 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015208 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015209 0x000A, /* LINE FEED */
15210 0x000D, /* CARRIAGE RETURN */
15211 0x001C, /* FILE SEPARATOR */
15212 0x001D, /* GROUP SEPARATOR */
15213 0x001E, /* RECORD SEPARATOR */
15214 0x0085, /* NEXT LINE */
15215 0x2028, /* LINE SEPARATOR */
15216 0x2029, /* PARAGRAPH SEPARATOR */
15217 };
15218
Fred Drakee4315f52000-05-09 19:53:39 +000015219 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015220 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015221 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015222 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015223 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015224
Guido van Rossumcacfc072002-05-24 19:01:59 +000015225 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015226 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015227
15228 /* initialize the linebreak bloom filter */
15229 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015230 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015231 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015232
Christian Heimes26532f72013-07-20 14:57:16 +020015233 if (PyType_Ready(&EncodingMapType) < 0)
15234 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015235
Benjamin Petersonc4311282012-10-30 23:21:10 -040015236 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15237 Py_FatalError("Can't initialize field name iterator type");
15238
15239 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15240 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015241
Victor Stinner3a50e702011-10-18 21:21:00 +020015242 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015243}
15244
15245/* Finalize the Unicode implementation */
15246
/* Frees nothing: the body only returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15252
Guido van Rossumd57fd912000-03-10 22:53:23 +000015253void
Thomas Wouters78890102000-07-22 19:25:51 +000015254_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015255{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015256 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015257
Serhiy Storchaka05997252013-01-26 12:14:02 +020015258 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015259
Serhiy Storchaka05997252013-01-26 12:14:02 +020015260 for (i = 0; i < 256; i++)
15261 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015262 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015263 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015264}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015265
Walter Dörwald16807132007-05-25 13:52:07 +000015266void
15267PyUnicode_InternInPlace(PyObject **p)
15268{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015269 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015271#ifdef Py_DEBUG
15272 assert(s != NULL);
15273 assert(_PyUnicode_CHECK(s));
15274#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015276 return;
15277#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015278 /* If it's a subclass, we don't really know what putting
15279 it in the interned dict might do. */
15280 if (!PyUnicode_CheckExact(s))
15281 return;
15282 if (PyUnicode_CHECK_INTERNED(s))
15283 return;
15284 if (interned == NULL) {
15285 interned = PyDict_New();
15286 if (interned == NULL) {
15287 PyErr_Clear(); /* Don't leave an exception */
15288 return;
15289 }
15290 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015292 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015294 if (t == NULL) {
15295 PyErr_Clear();
15296 return;
15297 }
15298 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015299 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015300 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015301 return;
15302 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 /* The two references in interned are not counted by refcnt.
15304 The deallocator will take care of this */
15305 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015306 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015307}
15308
15309void
15310PyUnicode_InternImmortal(PyObject **p)
15311{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 PyUnicode_InternInPlace(p);
15313 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015314 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 Py_INCREF(*p);
15316 }
Walter Dörwald16807132007-05-25 13:52:07 +000015317}
15318
15319PyObject *
15320PyUnicode_InternFromString(const char *cp)
15321{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015322 PyObject *s = PyUnicode_FromString(cp);
15323 if (s == NULL)
15324 return NULL;
15325 PyUnicode_InternInPlace(&s);
15326 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015327}
15328
Alexander Belopolsky40018472011-02-26 01:02:56 +000015329void
15330_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015331{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015332 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015333 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015334 Py_ssize_t i, n;
15335 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015336
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 if (interned == NULL || !PyDict_Check(interned))
15338 return;
15339 keys = PyDict_Keys(interned);
15340 if (keys == NULL || !PyList_Check(keys)) {
15341 PyErr_Clear();
15342 return;
15343 }
Walter Dörwald16807132007-05-25 13:52:07 +000015344
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15346 detector, interned unicode strings are not forcibly deallocated;
15347 rather, we give them their stolen references back, and then clear
15348 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015349
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 n = PyList_GET_SIZE(keys);
15351 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015352 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015354 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015355 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015356 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015358 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 case SSTATE_NOT_INTERNED:
15360 /* XXX Shouldn't happen */
15361 break;
15362 case SSTATE_INTERNED_IMMORTAL:
15363 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015364 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 break;
15366 case SSTATE_INTERNED_MORTAL:
15367 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015368 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 break;
15370 default:
15371 Py_FatalError("Inconsistent interned string state.");
15372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015373 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 }
15375 fprintf(stderr, "total size of all interned strings: "
15376 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15377 "mortal/immortal\n", mortal_size, immortal_size);
15378 Py_DECREF(keys);
15379 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015380 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015381}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015382
15383
15384/********************* Unicode Iterator **************************/
15385
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;       /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15391
15392static void
15393unicodeiter_dealloc(unicodeiterobject *it)
15394{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015395 _PyObject_GC_UNTRACK(it);
15396 Py_XDECREF(it->it_seq);
15397 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015398}
15399
/* GC traversal: the only object reference held by the iterator is it_seq. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15406
15407static PyObject *
15408unicodeiter_next(unicodeiterobject *it)
15409{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015410 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015411
Benjamin Peterson14339b62009-01-31 16:36:08 +000015412 assert(it != NULL);
15413 seq = it->it_seq;
15414 if (seq == NULL)
15415 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015416 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015418 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15419 int kind = PyUnicode_KIND(seq);
15420 void *data = PyUnicode_DATA(seq);
15421 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15422 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 if (item != NULL)
15424 ++it->it_index;
15425 return item;
15426 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015427
Benjamin Peterson14339b62009-01-31 16:36:08 +000015428 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015429 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015430 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015431}
15432
15433static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015434unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015435{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015436 Py_ssize_t len = 0;
15437 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015438 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015439 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015440}
15441
15442PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15443
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015444static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015445unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015446{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015447 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015448 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015449 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015450 it->it_seq, it->it_index);
15451 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015452 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015453 if (u == NULL)
15454 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015455 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015456 }
15457}
15458
15459PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15460
15461static PyObject *
15462unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15463{
15464 Py_ssize_t index = PyLong_AsSsize_t(state);
15465 if (index == -1 && PyErr_Occurred())
15466 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015467 if (it->it_seq != NULL) {
15468 if (index < 0)
15469 index = 0;
15470 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15471 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15472 it->it_index = index;
15473 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015474 Py_RETURN_NONE;
15475}
15476
15477PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15478
/* Methods of the str iterator: length hint plus pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
15488
/* Type object of the iterator returned by unicode_iter().  The initializer
   is positional: each entry fills the slot named in its trailing comment.
   Instances are GC-tracked (Py_TPFLAGS_HAVE_GC) and are their own
   iterator (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                         /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_reserved */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
15521
15522static PyObject *
15523unicode_iter(PyObject *seq)
15524{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015525 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015526
Benjamin Peterson14339b62009-01-31 16:36:08 +000015527 if (!PyUnicode_Check(seq)) {
15528 PyErr_BadInternalCall();
15529 return NULL;
15530 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015531 if (PyUnicode_READY(seq) == -1)
15532 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015533 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15534 if (it == NULL)
15535 return NULL;
15536 it->it_index = 0;
15537 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015538 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015539 _PyObject_GC_TRACK(it);
15540 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015541}
15542
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015543
/* Length of the NUL-terminated Py_UNICODE string `u`, not counting the
   terminator.  Delegates to wcslen(). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    return wcslen(u);
}
15549
15550Py_UNICODE*
15551Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15552{
15553 Py_UNICODE *u = s1;
15554 while ((*u++ = *s2++));
15555 return s1;
15556}
15557
15558Py_UNICODE*
15559Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15560{
15561 Py_UNICODE *u = s1;
15562 while ((*u++ = *s2++))
15563 if (n-- == 0)
15564 break;
15565 return s1;
15566}
15567
15568Py_UNICODE*
15569Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15570{
15571 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015572 u1 += wcslen(u1);
15573 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015574 return s1;
15575}
15576
15577int
15578Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15579{
15580 while (*s1 && *s2 && *s1 == *s2)
15581 s1++, s2++;
15582 if (*s1 && *s2)
15583 return (*s1 < *s2) ? -1 : +1;
15584 if (*s1)
15585 return 1;
15586 if (*s2)
15587 return -1;
15588 return 0;
15589}
15590
15591int
15592Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15593{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015594 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015595 for (; n != 0; n--) {
15596 u1 = *s1;
15597 u2 = *s2;
15598 if (u1 != u2)
15599 return (u1 < u2) ? -1 : +1;
15600 if (u1 == '\0')
15601 return 0;
15602 s1++;
15603 s2++;
15604 }
15605 return 0;
15606}
15607
15608Py_UNICODE*
15609Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15610{
15611 const Py_UNICODE *p;
15612 for (p = s; *p; p++)
15613 if (*p == c)
15614 return (Py_UNICODE*)p;
15615 return NULL;
15616}
15617
15618Py_UNICODE*
15619Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15620{
15621 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015622 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015623 while (p != s) {
15624 p--;
15625 if (*p == c)
15626 return (Py_UNICODE*)p;
15627 }
15628 return NULL;
15629}
Victor Stinner331ea922010-08-10 16:37:20 +000015630
Victor Stinner71133ff2010-09-01 23:43:53 +000015631Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015632PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015633{
Victor Stinner577db2c2011-10-11 22:12:48 +020015634 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015635 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015637 if (!PyUnicode_Check(unicode)) {
15638 PyErr_BadArgument();
15639 return NULL;
15640 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015641 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015642 if (u == NULL)
15643 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015644 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015645 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015646 PyErr_NoMemory();
15647 return NULL;
15648 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015649 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015650 size *= sizeof(Py_UNICODE);
15651 copy = PyMem_Malloc(size);
15652 if (copy == NULL) {
15653 PyErr_NoMemory();
15654 return NULL;
15655 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015656 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015657 return copy;
15658}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015659
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Method table of the _string module.  Both functions take exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15670
/* Module definition for _string.  The field names in the comments follow
   the documented PyModuleDef layout. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_slots */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
15682
/* Initialization function of the _string module: creates the module from
   _string_module and returns whatever PyModule_Create() returns. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
15688
15689
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015690#ifdef __cplusplus
15691}
15692#endif