blob: 8dd7c3b8258c3c407ef91ba55b765725ccf30412 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Larry Hastings61272b72014-01-07 12:41:53 -080052/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090053class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080054[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090055/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56
57/*[python input]
58class Py_UCS4_converter(CConverter):
59 type = 'Py_UCS4'
60 converter = 'convert_uc'
61
62 def converter_init(self):
63 if self.default is not unspecified:
64 self.c_default = ascii(self.default)
65 if len(self.c_default) > 4 or self.c_default[0] != "'":
66 self.c_default = hex(ord(self.default))
67
68[python start generated code]*/
69/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000071/* --- Globals ------------------------------------------------------------
72
Serhiy Storchaka05997252013-01-26 12:14:02 +020073NOTE: In the interpreter's initialization phase, some globals are currently
74 initialized dynamically as needed. In the process Unicode objects may
75 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000076
77*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
/* Highest valid Unicode code point: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10FFFF
86
/* Object check used inside assert() by the macros of this file: a plain type
   check in release builds, a consistency check (without content inspection)
   in debug builds. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Field accessors.  The underscore-prefixed macros read a struct member
   directly (some of them after asserting _PyUnicode_CHECK).
   PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() additionally assert that the
   string is ready and, for compact ASCII strings, use the bytes located right
   after the PyASCIIObject header and the object's length instead of the
   utf8/utf8_length members. */
#define _PyUnicode_UTF8(op)                                     \
    (((PyCompactUnicodeObject*)(op))->utf8)

#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op)                             \
         ? ((char*)((PyASCIIObject*)(op) + 1))                  \
         : _PyUnicode_UTF8(op))

#define _PyUnicode_UTF8_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op)                             \
         ? ((PyASCIIObject*)(op))->length                       \
         : _PyUnicode_UTF8_LENGTH(op))

#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)

#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)

#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200127
/* File-local replacement of PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* True if the utf8 pointer of the (non compact-ASCII) string is the same
   pointer as its character data. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the string is the same pointer as its
   character data.  Only the macro argument is referenced: the expansion
   must not depend on a variable named "unicode" existing at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (!PyUnicode_IS_COMPACT_ASCII(op)                            \
     && _PyUnicode_UTF8(op) != NULL                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (_PyUnicode_WSTR(op) != NULL                                \
     && (!PyUnicode_IS_READY(op)                                \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
156
/* Generic helper macro copying characters from one character type to
   another, casting each element.  from_type and to_type must be valid type
   names; begin and end delimit the source characters (pointers usable as
   "from_type *"); to points to the destination buffer (usable as
   "to_type *").  The bulk of the input is copied four elements per
   iteration, the remaining elements one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t _count = _end - _iter;                               \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; ++_iter, ++_to) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
188
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return unicode_empty from the enclosing function after taking a reference
   via _Py_INCREF_UNICODE_EMPTY(); the returned value is NULL if the
   singleton could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
/* Store `value` into `length` consecutive characters of the buffer `data`,
   starting at character index `start`.  `kind` selects the character width
   (1, 2 or 4 byte kind) and must not be PyUnicode_WCHAR_KIND.
   Every macro argument is parenthesized where it is used so that compound
   expressions (e.g. "c ? a : b") are cast and compared as a whole.
   Note that `length` and `value` are evaluated once per stored character in
   the 2 and 4 byte cases. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
244
245
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200246/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700247static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200248_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
249
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200250/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200251static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253/* Single character Unicode strings in the Latin-1 range are being
254 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200255static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the ASCII code point c (0x00..0x7F) is whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
287
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200288/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200289static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200290static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100291static int unicode_modifiable(PyObject *unicode);
292
Victor Stinnerfe226c02011-10-03 03:52:20 +0200293
Alexander Belopolsky40018472011-02-26 01:02:56 +0000294static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100295_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200296static PyObject *
297_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
298static PyObject *
299_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
300
301static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000302unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000303 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100304 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000305 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
306
Alexander Belopolsky40018472011-02-26 01:02:56 +0000307static void
308raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300309 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100310 PyObject *unicode,
311 Py_ssize_t startpos, Py_ssize_t endpos,
312 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000313
/* Same kind of table for line breaks: entry c is 1 when the ASCII code
   point c (0x00..0x7F) is a line break character, 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
341
INADA Naoki3ae20562017-01-16 20:41:20 +0900342static int convert_uc(PyObject *obj, void *addr);
343
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300344#include "clinic/unicodeobject.c.h"
345
/* Codec error handlers identified by get_error_handler(); any handler name
   that is not one of the names compared there maps to _Py_ERROR_OTHER. */
typedef enum {
    _Py_ERROR_UNKNOWN           = 0,
    _Py_ERROR_STRICT            = 1,
    _Py_ERROR_SURROGATEESCAPE   = 2,
    _Py_ERROR_REPLACE           = 3,
    _Py_ERROR_IGNORE            = 4,
    _Py_ERROR_BACKSLASHREPLACE  = 5,
    _Py_ERROR_SURROGATEPASS     = 6,
    _Py_ERROR_XMLCHARREFREPLACE = 7,
    _Py_ERROR_OTHER             = 8
} _Py_error_handler;
357
358static _Py_error_handler
359get_error_handler(const char *errors)
360{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200361 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200362 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200363 }
364 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200365 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200366 }
367 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200368 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200369 }
370 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200371 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200372 }
373 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200374 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 }
376 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200377 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 }
379 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200380 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_OTHER;
383}
384
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300385/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
386 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000387Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000388PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000389{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000390#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000391 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000392#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000393 /* This is actually an illegal character, so it should
394 not be passed to unichr. */
395 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000396#endif
397}
398
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a string object.

   op:            the object to inspect; must satisfy PyUnicode_Check().
   check_content: if non-zero (and the string is not of the wchar kind),
                  also scan the characters to verify that the narrowest
                  suitable kind is used and that the data is terminated
                  by a null character.

   Every violated invariant trips an assert().  Returns 1 otherwise, so the
   call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        /* kind whose character type has the size of wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int native_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int native_kind = PyUnicode_4BYTE_KIND;
#endif
        PyCompactUnicodeObject *cobj = (PyCompactUnicodeObject *)op;
        void *payload;

        if (base->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            payload = cobj + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(cobj->utf8 != payload);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact, not ready: only the wstr representation exists */
            payload = ((PyUnicodeObject *)op)->data.any;
            assert(base->length == 0);
            assert(base->hash == -1);
            assert(base->state.compact == 0);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 0);
            assert(base->state.interned == SSTATE_NOT_INTERNED);
            assert(base->wstr != NULL);
            assert(payload == NULL);
            assert(cobj->utf8 == NULL);
        }
        else {
            /* non-compact, ready */
            payload = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.compact == 0);
            assert(base->state.ready == 1);
            assert(payload != NULL);
            if (base->state.ascii) {
                assert(cobj->utf8 == payload);
                assert(cobj->utf8_length == base->length);
            }
            else {
                assert(cobj->utf8 != payload);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is the character data exactly when the kind matches the
               size of wchar_t */
            if (kind == native_kind) {
                assert(base->wstr == payload);
                assert(cobj->wstr_length == base->length);
            }
            else {
                assert(base->wstr != payload);
            }
        }

        if (cobj->utf8 == NULL) {
            assert(cobj->utf8_length == 0);
        }
        if (base->wstr == NULL) {
            assert(cobj->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(base);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 cp = PyUnicode_READ(kind, chars, pos);
            if (cp > highest) {
                highest = cp;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be a null character */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
514
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100515static PyObject*
516unicode_result_wchar(PyObject *unicode)
517{
518#ifndef Py_DEBUG
519 Py_ssize_t len;
520
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100521 len = _PyUnicode_WSTR_LENGTH(unicode);
522 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100523 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200524 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100525 }
526
527 if (len == 1) {
528 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100529 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100530 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
531 Py_DECREF(unicode);
532 return latin1_char;
533 }
534 }
535
536 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200537 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100538 return NULL;
539 }
540#else
Victor Stinneraa771272012-10-04 02:32:58 +0200541 assert(Py_REFCNT(unicode) == 1);
542
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100543 /* don't make the result ready in debug mode to ensure that the caller
544 makes the string ready before using it */
545 assert(_PyUnicode_CheckConsistency(unicode, 1));
546#endif
547 return unicode;
548}
549
550static PyObject*
551unicode_result_ready(PyObject *unicode)
552{
553 Py_ssize_t length;
554
555 length = PyUnicode_GET_LENGTH(unicode);
556 if (length == 0) {
557 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100558 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200559 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100560 }
561 return unicode_empty;
562 }
563
564 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200565 void *data = PyUnicode_DATA(unicode);
566 int kind = PyUnicode_KIND(unicode);
567 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100568 if (ch < 256) {
569 PyObject *latin1_char = unicode_latin1[ch];
570 if (latin1_char != NULL) {
571 if (unicode != latin1_char) {
572 Py_INCREF(latin1_char);
573 Py_DECREF(unicode);
574 }
575 return latin1_char;
576 }
577 else {
578 assert(_PyUnicode_CheckConsistency(unicode, 1));
579 Py_INCREF(unicode);
580 unicode_latin1[ch] = unicode;
581 return unicode;
582 }
583 }
584 }
585
586 assert(_PyUnicode_CheckConsistency(unicode, 1));
587 return unicode;
588}
589
590static PyObject*
591unicode_result(PyObject *unicode)
592{
593 assert(_PyUnicode_CHECK(unicode));
594 if (PyUnicode_IS_READY(unicode))
595 return unicode_result_ready(unicode);
596 else
597 return unicode_result_wchar(unicode);
598}
599
Victor Stinnerc4b49542011-12-11 22:44:26 +0100600static PyObject*
601unicode_result_unchanged(PyObject *unicode)
602{
603 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500604 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100605 return NULL;
606 Py_INCREF(unicode);
607 return unicode;
608 }
609 else
610 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100611 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100612}
613
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200614/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
615 ASCII, Latin1, UTF-8, etc. */
616static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200617backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
619{
Victor Stinnerad771582015-10-09 12:38:53 +0200620 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200621 Py_UCS4 ch;
622 enum PyUnicode_Kind kind;
623 void *data;
624
625 assert(PyUnicode_IS_READY(unicode));
626 kind = PyUnicode_KIND(unicode);
627 data = PyUnicode_DATA(unicode);
628
629 size = 0;
630 /* determine replacement size */
631 for (i = collstart; i < collend; ++i) {
632 Py_ssize_t incr;
633
634 ch = PyUnicode_READ(kind, data, i);
635 if (ch < 0x100)
636 incr = 2+2;
637 else if (ch < 0x10000)
638 incr = 2+4;
639 else {
640 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200641 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200642 }
643 if (size > PY_SSIZE_T_MAX - incr) {
644 PyErr_SetString(PyExc_OverflowError,
645 "encoded result is too long for a Python string");
646 return NULL;
647 }
648 size += incr;
649 }
650
Victor Stinnerad771582015-10-09 12:38:53 +0200651 str = _PyBytesWriter_Prepare(writer, str, size);
652 if (str == NULL)
653 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200654
655 /* generate replacement */
656 for (i = collstart; i < collend; ++i) {
657 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200658 *str++ = '\\';
659 if (ch >= 0x00010000) {
660 *str++ = 'U';
661 *str++ = Py_hexdigits[(ch>>28)&0xf];
662 *str++ = Py_hexdigits[(ch>>24)&0xf];
663 *str++ = Py_hexdigits[(ch>>20)&0xf];
664 *str++ = Py_hexdigits[(ch>>16)&0xf];
665 *str++ = Py_hexdigits[(ch>>12)&0xf];
666 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200667 }
Victor Stinner797485e2015-10-09 03:17:30 +0200668 else if (ch >= 0x100) {
669 *str++ = 'u';
670 *str++ = Py_hexdigits[(ch>>12)&0xf];
671 *str++ = Py_hexdigits[(ch>>8)&0xf];
672 }
673 else
674 *str++ = 'x';
675 *str++ = Py_hexdigits[(ch>>4)&0xf];
676 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200677 }
678 return str;
679}
680
681/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
682 ASCII, Latin1, UTF-8, etc. */
683static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200684xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200685 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
686{
Victor Stinnerad771582015-10-09 12:38:53 +0200687 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200688 Py_UCS4 ch;
689 enum PyUnicode_Kind kind;
690 void *data;
691
692 assert(PyUnicode_IS_READY(unicode));
693 kind = PyUnicode_KIND(unicode);
694 data = PyUnicode_DATA(unicode);
695
696 size = 0;
697 /* determine replacement size */
698 for (i = collstart; i < collend; ++i) {
699 Py_ssize_t incr;
700
701 ch = PyUnicode_READ(kind, data, i);
702 if (ch < 10)
703 incr = 2+1+1;
704 else if (ch < 100)
705 incr = 2+2+1;
706 else if (ch < 1000)
707 incr = 2+3+1;
708 else if (ch < 10000)
709 incr = 2+4+1;
710 else if (ch < 100000)
711 incr = 2+5+1;
712 else if (ch < 1000000)
713 incr = 2+6+1;
714 else {
715 assert(ch <= MAX_UNICODE);
716 incr = 2+7+1;
717 }
718 if (size > PY_SSIZE_T_MAX - incr) {
719 PyErr_SetString(PyExc_OverflowError,
720 "encoded result is too long for a Python string");
721 return NULL;
722 }
723 size += incr;
724 }
725
Victor Stinnerad771582015-10-09 12:38:53 +0200726 str = _PyBytesWriter_Prepare(writer, str, size);
727 if (str == NULL)
728 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200729
730 /* generate replacement */
731 for (i = collstart; i < collend; ++i) {
732 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
733 }
734 return str;
735}
736
Thomas Wouters477c8d52006-05-27 19:21:47 +0000737/* --- Bloom Filters ----------------------------------------------------- */
738
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (ch & (BLOOM_WIDTH - 1))
   as the bit index. */
742
743/* the linebreak mask is set up by Unicode_Init below */
744
Antoine Pitrouf068f942010-01-13 14:19:12 +0000745#if LONG_BIT >= 128
746#define BLOOM_WIDTH 128
747#elif LONG_BIT >= 64
748#define BLOOM_WIDTH 64
749#elif LONG_BIT >= 32
750#define BLOOM_WIDTH 32
751#else
752#error "LONG_BIT is smaller than 32"
753#endif
754
Thomas Wouters477c8d52006-05-27 19:21:47 +0000755#define BLOOM_MASK unsigned long
756
Serhiy Storchaka05997252013-01-26 12:14:02 +0200757static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000758
Antoine Pitrouf068f942010-01-13 14:19:12 +0000759#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000760
Benjamin Peterson29060642009-01-31 22:14:21 +0000761#define BLOOM_LINEBREAK(ch) \
762 ((ch) < 128U ? ascii_linebreak[(ch)] : \
763 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000764
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700765static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200766make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000767{
Victor Stinnera85af502013-04-09 21:53:54 +0200768#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
769 do { \
770 TYPE *data = (TYPE *)PTR; \
771 TYPE *end = data + LEN; \
772 Py_UCS4 ch; \
773 for (; data != end; data++) { \
774 ch = *data; \
775 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
776 } \
777 break; \
778 } while (0)
779
Thomas Wouters477c8d52006-05-27 19:21:47 +0000780 /* calculate simple bloom-style bitmask for a given unicode string */
781
Antoine Pitrouf068f942010-01-13 14:19:12 +0000782 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000783
784 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200785 switch (kind) {
786 case PyUnicode_1BYTE_KIND:
787 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
788 break;
789 case PyUnicode_2BYTE_KIND:
790 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
791 break;
792 case PyUnicode_4BYTE_KIND:
793 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
794 break;
795 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700796 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200797 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000798 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200799
800#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000801}
802
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300803static int
804ensure_unicode(PyObject *obj)
805{
806 if (!PyUnicode_Check(obj)) {
807 PyErr_Format(PyExc_TypeError,
808 "must be str, not %.100s",
809 Py_TYPE(obj)->tp_name);
810 return -1;
811 }
812 return PyUnicode_READY(obj);
813}
814
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200815/* Compilation of templated routines */
816
817#include "stringlib/asciilib.h"
818#include "stringlib/fastsearch.h"
819#include "stringlib/partition.h"
820#include "stringlib/split.h"
821#include "stringlib/count.h"
822#include "stringlib/find.h"
823#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200824#include "stringlib/undef.h"
825
826#include "stringlib/ucs1lib.h"
827#include "stringlib/fastsearch.h"
828#include "stringlib/partition.h"
829#include "stringlib/split.h"
830#include "stringlib/count.h"
831#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300832#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200833#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200834#include "stringlib/undef.h"
835
836#include "stringlib/ucs2lib.h"
837#include "stringlib/fastsearch.h"
838#include "stringlib/partition.h"
839#include "stringlib/split.h"
840#include "stringlib/count.h"
841#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300842#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200843#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200844#include "stringlib/undef.h"
845
846#include "stringlib/ucs4lib.h"
847#include "stringlib/fastsearch.h"
848#include "stringlib/partition.h"
849#include "stringlib/split.h"
850#include "stringlib/count.h"
851#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300852#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200853#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200854#include "stringlib/undef.h"
855
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200856#include "stringlib/unicodedefs.h"
857#include "stringlib/fastsearch.h"
858#include "stringlib/count.h"
859#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100860#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200861
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862/* --- Unicode Object ----------------------------------------------------- */
863
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700864static inline Py_ssize_t
865findchar(const void *s, int kind,
866 Py_ssize_t size, Py_UCS4 ch,
867 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200868{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200869 switch (kind) {
870 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200871 if ((Py_UCS1) ch != ch)
872 return -1;
873 if (direction > 0)
874 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
875 else
876 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200877 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200878 if ((Py_UCS2) ch != ch)
879 return -1;
880 if (direction > 0)
881 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
882 else
883 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200884 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200885 if (direction > 0)
886 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
887 else
888 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200889 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700890 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200892}
893
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, each
   byte being set to 0xff; nothing is done if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* offsets and sizes are in bytes, hence the scaling by char_size */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
912
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913static PyObject*
914resize_compact(PyObject *unicode, Py_ssize_t length)
915{
916 Py_ssize_t char_size;
917 Py_ssize_t struct_size;
918 Py_ssize_t new_size;
919 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100920 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200921#ifdef Py_DEBUG
922 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
923#endif
924
Victor Stinner79891572012-05-03 13:43:07 +0200925 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100927 assert(PyUnicode_IS_COMPACT(unicode));
928
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200929 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100930 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200931 struct_size = sizeof(PyASCIIObject);
932 else
933 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200934 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935
Victor Stinnerfe226c02011-10-03 03:52:20 +0200936 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
937 PyErr_NoMemory();
938 return NULL;
939 }
940 new_size = (struct_size + (length + 1) * char_size);
941
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200942 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
943 PyObject_DEL(_PyUnicode_UTF8(unicode));
944 _PyUnicode_UTF8(unicode) = NULL;
945 _PyUnicode_UTF8_LENGTH(unicode) = 0;
946 }
Victor Stinner84def372011-12-11 20:04:56 +0100947 _Py_DEC_REFTOTAL;
948 _Py_ForgetReference(unicode);
949
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300950 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100951 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100952 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200953 PyErr_NoMemory();
954 return NULL;
955 }
Victor Stinner84def372011-12-11 20:04:56 +0100956 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100958
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200960 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100962 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200963 _PyUnicode_WSTR_LENGTH(unicode) = length;
964 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100965 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
966 PyObject_DEL(_PyUnicode_WSTR(unicode));
967 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100968 if (!PyUnicode_IS_ASCII(unicode))
969 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100970 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200971#ifdef Py_DEBUG
972 unicode_fill_invalid(unicode, old_length);
973#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200974 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
975 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200976 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977 return unicode;
978}
979
Alexander Belopolsky40018472011-02-26 01:02:56 +0000980static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200981resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982{
Victor Stinner95663112011-10-04 01:03:50 +0200983 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100984 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200986 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000987
Victor Stinnerfe226c02011-10-03 03:52:20 +0200988 if (PyUnicode_IS_READY(unicode)) {
989 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200990 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200991 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200992#ifdef Py_DEBUG
993 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
994#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995
996 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200997 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200998 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
999 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001000
1001 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1002 PyErr_NoMemory();
1003 return -1;
1004 }
1005 new_size = (length + 1) * char_size;
1006
Victor Stinner7a9105a2011-12-12 00:13:42 +01001007 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1008 {
1009 PyObject_DEL(_PyUnicode_UTF8(unicode));
1010 _PyUnicode_UTF8(unicode) = NULL;
1011 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1012 }
1013
Victor Stinnerfe226c02011-10-03 03:52:20 +02001014 data = (PyObject *)PyObject_REALLOC(data, new_size);
1015 if (data == NULL) {
1016 PyErr_NoMemory();
1017 return -1;
1018 }
1019 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001020 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001021 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001022 _PyUnicode_WSTR_LENGTH(unicode) = length;
1023 }
1024 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001025 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001026 _PyUnicode_UTF8_LENGTH(unicode) = length;
1027 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028 _PyUnicode_LENGTH(unicode) = length;
1029 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001030#ifdef Py_DEBUG
1031 unicode_fill_invalid(unicode, old_length);
1032#endif
Victor Stinner95663112011-10-04 01:03:50 +02001033 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001034 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001037 }
Victor Stinner95663112011-10-04 01:03:50 +02001038 assert(_PyUnicode_WSTR(unicode) != NULL);
1039
1040 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001041 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001042 PyErr_NoMemory();
1043 return -1;
1044 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001046 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001048 if (!wstr) {
1049 PyErr_NoMemory();
1050 return -1;
1051 }
1052 _PyUnicode_WSTR(unicode) = wstr;
1053 _PyUnicode_WSTR(unicode)[length] = 0;
1054 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001055 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 return 0;
1057}
1058
Victor Stinnerfe226c02011-10-03 03:52:20 +02001059static PyObject*
1060resize_copy(PyObject *unicode, Py_ssize_t length)
1061{
1062 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001063 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001064 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001065
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001066 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001067
1068 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1069 if (copy == NULL)
1070 return NULL;
1071
1072 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001073 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001075 }
1076 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001077 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001078
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001079 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 if (w == NULL)
1081 return NULL;
1082 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1083 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001084 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001085 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001086 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001087 }
1088}
1089
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001091 Ux0000 terminated; some code (e.g. new_identifier)
1092 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093
1094 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001095 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096
1097*/
1098
Alexander Belopolsky40018472011-02-26 01:02:56 +00001099static PyUnicodeObject *
1100_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001102 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104
Thomas Wouters477c8d52006-05-27 19:21:47 +00001105 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106 if (length == 0 && unicode_empty != NULL) {
1107 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001108 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109 }
1110
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001111 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001112 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001113 return (PyUnicodeObject *)PyErr_NoMemory();
1114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 if (length < 0) {
1116 PyErr_SetString(PyExc_SystemError,
1117 "Negative size passed to _PyUnicode_New");
1118 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 }
1120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1122 if (unicode == NULL)
1123 return NULL;
1124 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001125
1126 _PyUnicode_WSTR_LENGTH(unicode) = length;
1127 _PyUnicode_HASH(unicode) = -1;
1128 _PyUnicode_STATE(unicode).interned = 0;
1129 _PyUnicode_STATE(unicode).kind = 0;
1130 _PyUnicode_STATE(unicode).compact = 0;
1131 _PyUnicode_STATE(unicode).ready = 0;
1132 _PyUnicode_STATE(unicode).ascii = 0;
1133 _PyUnicode_DATA_ANY(unicode) = NULL;
1134 _PyUnicode_LENGTH(unicode) = 0;
1135 _PyUnicode_UTF8(unicode) = NULL;
1136 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1139 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001140 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001141 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001142 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144
Jeremy Hyltond8082792003-09-16 19:41:39 +00001145 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001146 * the caller fails before initializing str -- unicode_resize()
1147 * reads str[0], and the Keep-Alive optimization can keep memory
1148 * allocated for str alive across a call to unicode_dealloc(unicode).
1149 * We don't want unicode_resize to read uninitialized memory in
1150 * that case.
1151 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152 _PyUnicode_WSTR(unicode)[0] = 0;
1153 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001154
Victor Stinner7931d9a2011-11-04 00:22:48 +01001155 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 return unicode;
1157}
1158
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159static const char*
1160unicode_kind_name(PyObject *unicode)
1161{
Victor Stinner42dfd712011-10-03 14:41:45 +02001162 /* don't check consistency: unicode_kind_name() is called from
1163 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001164 if (!PyUnicode_IS_COMPACT(unicode))
1165 {
1166 if (!PyUnicode_IS_READY(unicode))
1167 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001168 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 {
1170 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 return "legacy ascii";
1173 else
1174 return "legacy latin1";
1175 case PyUnicode_2BYTE_KIND:
1176 return "legacy UCS2";
1177 case PyUnicode_4BYTE_KIND:
1178 return "legacy UCS4";
1179 default:
1180 return "<legacy invalid kind>";
1181 }
1182 }
1183 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001184 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001185 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001186 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001187 return "ascii";
1188 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001189 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001190 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001191 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001192 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001193 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001194 default:
1195 return "<invalid compact kind>";
1196 }
1197}
1198
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8(unicode). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA(unicode). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses involved in locating the character data of unicode
   to stdout, then return PyUnicode_DATA(unicode).

   Every pointer printed with %p is converted to void* first, as the
   conversion requires. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", (void *)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string op to stdout: kind name,
   length, the wstr pointer, for everything but compact ASCII strings also
   the wstr length and the utf8 pointer and length, and finally the data
   pointer.  "shared " is printed before a pointer that aliases the data.

   Arguments are converted to the exact type expected by their conversion
   specifier: void* for %p and size_t for PY_FORMAT_SIZE_T "u". */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* compact strings store the data right after the object header */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
           unicode_kind_name(op), (size_t)ascii->length);

    if ((void *)ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* compact ASCII strings have no wstr_length/utf8 fields to print */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%" PY_FORMAT_SIZE_T "u), ", (size_t)compact->wstr_length);
        if (!ascii->state.compact
            && (void *)compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
               (void *)compact->utf8, (size_t)compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
1252
1253PyObject *
1254PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1255{
1256 PyObject *obj;
1257 PyCompactUnicodeObject *unicode;
1258 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001259 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001260 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001261 Py_ssize_t char_size;
1262 Py_ssize_t struct_size;
1263
1264 /* Optimization for empty strings */
1265 if (size == 0 && unicode_empty != NULL) {
1266 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001267 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 }
1269
Victor Stinner9e9d6892011-10-04 01:02:02 +02001270 is_ascii = 0;
1271 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 struct_size = sizeof(PyCompactUnicodeObject);
1273 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001274 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 char_size = 1;
1276 is_ascii = 1;
1277 struct_size = sizeof(PyASCIIObject);
1278 }
1279 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001280 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001281 char_size = 1;
1282 }
1283 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001284 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285 char_size = 2;
1286 if (sizeof(wchar_t) == 2)
1287 is_sharing = 1;
1288 }
1289 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001290 if (maxchar > MAX_UNICODE) {
1291 PyErr_SetString(PyExc_SystemError,
1292 "invalid maximum character passed to PyUnicode_New");
1293 return NULL;
1294 }
Victor Stinner8f825062012-04-27 13:55:39 +02001295 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 char_size = 4;
1297 if (sizeof(wchar_t) == 4)
1298 is_sharing = 1;
1299 }
1300
1301 /* Ensure we won't overflow the size. */
1302 if (size < 0) {
1303 PyErr_SetString(PyExc_SystemError,
1304 "Negative size passed to PyUnicode_New");
1305 return NULL;
1306 }
1307 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1308 return PyErr_NoMemory();
1309
1310 /* Duplicated allocation code from _PyObject_New() instead of a call to
1311 * PyObject_New() so we are able to allocate space for the object and
1312 * it's data buffer.
1313 */
1314 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1315 if (obj == NULL)
1316 return PyErr_NoMemory();
1317 obj = PyObject_INIT(obj, &PyUnicode_Type);
1318 if (obj == NULL)
1319 return NULL;
1320
1321 unicode = (PyCompactUnicodeObject *)obj;
1322 if (is_ascii)
1323 data = ((PyASCIIObject*)obj) + 1;
1324 else
1325 data = unicode + 1;
1326 _PyUnicode_LENGTH(unicode) = size;
1327 _PyUnicode_HASH(unicode) = -1;
1328 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 _PyUnicode_STATE(unicode).compact = 1;
1331 _PyUnicode_STATE(unicode).ready = 1;
1332 _PyUnicode_STATE(unicode).ascii = is_ascii;
1333 if (is_ascii) {
1334 ((char*)data)[size] = 0;
1335 _PyUnicode_WSTR(unicode) = NULL;
1336 }
Victor Stinner8f825062012-04-27 13:55:39 +02001337 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 ((char*)data)[size] = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001342 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 else {
1345 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001346 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001347 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001349 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350 ((Py_UCS4*)data)[size] = 0;
1351 if (is_sharing) {
1352 _PyUnicode_WSTR_LENGTH(unicode) = size;
1353 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1354 }
1355 else {
1356 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1357 _PyUnicode_WSTR(unicode) = NULL;
1358 }
1359 }
Victor Stinner8f825062012-04-27 13:55:39 +02001360#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001361 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001362#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001363 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 return obj;
1365}
1366
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   The characters in [begin, end) are written to the 4-byte data of unicode.
   A high surrogate immediately followed by a low surrogate becomes a single
   code point; every other character, lone surrogates included, is copied
   as is.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *iter = begin;
    Py_UCS4 *ucs4_out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);

    while (iter < end) {
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
            || (iter+1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
        {
            /* not a complete surrogate pair: copy the character as is */
            *ucs4_out++ = *iter++;
            continue;
        }
        *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
        iter += 2;
    }
    /* the output must have been filled exactly */
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1406
Victor Stinnercd9950f2011-10-02 00:34:53 +02001407static int
Victor Stinner488fa492011-12-12 00:01:39 +01001408unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001409{
Victor Stinner488fa492011-12-12 00:01:39 +01001410 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001411 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001412 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001413 return -1;
1414 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001415 return 0;
1416}
1417
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001418static int
1419_copy_characters(PyObject *to, Py_ssize_t to_start,
1420 PyObject *from, Py_ssize_t from_start,
1421 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 unsigned int from_kind, to_kind;
1424 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425
Victor Stinneree4544c2012-05-09 22:24:08 +02001426 assert(0 <= how_many);
1427 assert(0 <= from_start);
1428 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001429 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001430 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001431 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432
Victor Stinnerd3f08822012-05-29 12:57:52 +02001433 assert(PyUnicode_Check(to));
1434 assert(PyUnicode_IS_READY(to));
1435 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1436
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 if (how_many == 0)
1438 return 0;
1439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001441 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001443 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444
Victor Stinnerf1852262012-06-16 16:38:26 +02001445#ifdef Py_DEBUG
1446 if (!check_maxchar
1447 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1448 {
1449 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1450 Py_UCS4 ch;
1451 Py_ssize_t i;
1452 for (i=0; i < how_many; i++) {
1453 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1454 assert(ch <= to_maxchar);
1455 }
1456 }
1457#endif
1458
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001459 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001460 if (check_maxchar
1461 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1462 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001463 /* Writing Latin-1 characters into an ASCII string requires to
1464 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001465 Py_UCS4 max_char;
1466 max_char = ucs1lib_find_max_char(from_data,
1467 (Py_UCS1*)from_data + how_many);
1468 if (max_char >= 128)
1469 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001470 }
Christian Heimesf051e432016-09-13 20:22:02 +02001471 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001472 (char*)from_data + from_kind * from_start,
1473 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001475 else if (from_kind == PyUnicode_1BYTE_KIND
1476 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001477 {
1478 _PyUnicode_CONVERT_BYTES(
1479 Py_UCS1, Py_UCS2,
1480 PyUnicode_1BYTE_DATA(from) + from_start,
1481 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1482 PyUnicode_2BYTE_DATA(to) + to_start
1483 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001484 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001485 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 && to_kind == PyUnicode_4BYTE_KIND)
1487 {
1488 _PyUnicode_CONVERT_BYTES(
1489 Py_UCS1, Py_UCS4,
1490 PyUnicode_1BYTE_DATA(from) + from_start,
1491 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1492 PyUnicode_4BYTE_DATA(to) + to_start
1493 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001494 }
1495 else if (from_kind == PyUnicode_2BYTE_KIND
1496 && to_kind == PyUnicode_4BYTE_KIND)
1497 {
1498 _PyUnicode_CONVERT_BYTES(
1499 Py_UCS2, Py_UCS4,
1500 PyUnicode_2BYTE_DATA(from) + from_start,
1501 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1502 PyUnicode_4BYTE_DATA(to) + to_start
1503 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001504 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001505 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001506 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1507
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001508 if (!check_maxchar) {
1509 if (from_kind == PyUnicode_2BYTE_KIND
1510 && to_kind == PyUnicode_1BYTE_KIND)
1511 {
1512 _PyUnicode_CONVERT_BYTES(
1513 Py_UCS2, Py_UCS1,
1514 PyUnicode_2BYTE_DATA(from) + from_start,
1515 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1516 PyUnicode_1BYTE_DATA(to) + to_start
1517 );
1518 }
1519 else if (from_kind == PyUnicode_4BYTE_KIND
1520 && to_kind == PyUnicode_1BYTE_KIND)
1521 {
1522 _PyUnicode_CONVERT_BYTES(
1523 Py_UCS4, Py_UCS1,
1524 PyUnicode_4BYTE_DATA(from) + from_start,
1525 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1526 PyUnicode_1BYTE_DATA(to) + to_start
1527 );
1528 }
1529 else if (from_kind == PyUnicode_4BYTE_KIND
1530 && to_kind == PyUnicode_2BYTE_KIND)
1531 {
1532 _PyUnicode_CONVERT_BYTES(
1533 Py_UCS4, Py_UCS2,
1534 PyUnicode_4BYTE_DATA(from) + from_start,
1535 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1536 PyUnicode_2BYTE_DATA(to) + to_start
1537 );
1538 }
1539 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001540 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001541 }
1542 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001543 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001544 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001545 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001546 Py_ssize_t i;
1547
Victor Stinnera0702ab2011-09-29 14:14:38 +02001548 for (i=0; i < how_many; i++) {
1549 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001550 if (ch > to_maxchar)
1551 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001552 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1553 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001554 }
1555 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001556 return 0;
1557}
1558
Victor Stinnerd3f08822012-05-29 12:57:52 +02001559void
1560_PyUnicode_FastCopyCharacters(
1561 PyObject *to, Py_ssize_t to_start,
1562 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563{
1564 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1565}
1566
1567Py_ssize_t
1568PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1569 PyObject *from, Py_ssize_t from_start,
1570 Py_ssize_t how_many)
1571{
1572 int err;
1573
1574 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1575 PyErr_BadInternalCall();
1576 return -1;
1577 }
1578
Benjamin Petersonbac79492012-01-14 13:34:47 -05001579 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001581 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001582 return -1;
1583
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001584 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001585 PyErr_SetString(PyExc_IndexError, "string index out of range");
1586 return -1;
1587 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001588 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001589 PyErr_SetString(PyExc_IndexError, "string index out of range");
1590 return -1;
1591 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001592 if (how_many < 0) {
1593 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1594 return -1;
1595 }
1596 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001597 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1598 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001599 "Cannot write %zi characters at %zi "
1600 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001601 how_many, to_start, PyUnicode_GET_LENGTH(to));
1602 return -1;
1603 }
1604
1605 if (how_many == 0)
1606 return 0;
1607
Victor Stinner488fa492011-12-12 00:01:39 +01001608 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001609 return -1;
1610
1611 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1612 if (err) {
1613 PyErr_Format(PyExc_SystemError,
1614 "Cannot copy %s characters "
1615 "into a string of %s characters",
1616 unicode_kind_name(from),
1617 unicode_kind_name(to));
1618 return -1;
1619 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001620 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001621}
1622
Victor Stinner17222162011-09-28 22:15:37 +02001623/* Find the maximum code point and count the number of surrogate pairs so a
1624 correct string length can be computed before converting a string to UCS4.
1625 This function counts single surrogates as a character and not as a pair.
1626
1627 Return 0 on success, or -1 on error. */
1628static int
1629find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1630 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631{
1632 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001633 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634
Victor Stinnerc53be962011-10-02 21:33:54 +02001635 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 *num_surrogates = 0;
1637 *maxchar = 0;
1638
1639 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001641 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1642 && (iter+1) < end
1643 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1644 {
1645 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1646 ++(*num_surrogates);
1647 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 }
1649 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001651 {
1652 ch = *iter;
1653 iter++;
1654 }
1655 if (ch > *maxchar) {
1656 *maxchar = ch;
1657 if (*maxchar > MAX_UNICODE) {
1658 PyErr_Format(PyExc_ValueError,
1659 "character U+%x is not in range [U+0000; U+10ffff]",
1660 ch);
1661 return -1;
1662 }
1663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001664 }
1665 return 0;
1666}
1667
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001668int
1669_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670{
1671 wchar_t *end;
1672 Py_UCS4 maxchar = 0;
1673 Py_ssize_t num_surrogates;
1674#if SIZEOF_WCHAR_T == 2
1675 Py_ssize_t length_wo_surrogates;
1676#endif
1677
Georg Brandl7597add2011-10-05 16:36:47 +02001678 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001679 strings were created using _PyObject_New() and where no canonical
1680 representation (the str field) has been set yet aka strings
1681 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001682 assert(_PyUnicode_CHECK(unicode));
1683 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001685 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001686 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001687 /* Actually, it should neither be interned nor be anything else: */
1688 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001691 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001692 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694
1695 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001696 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1697 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 PyErr_NoMemory();
1699 return -1;
1700 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001701 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702 _PyUnicode_WSTR(unicode), end,
1703 PyUnicode_1BYTE_DATA(unicode));
1704 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1705 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1706 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1707 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001708 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001709 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001710 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 }
1712 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001713 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001714 _PyUnicode_UTF8(unicode) = NULL;
1715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 }
1717 PyObject_FREE(_PyUnicode_WSTR(unicode));
1718 _PyUnicode_WSTR(unicode) = NULL;
1719 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1720 }
1721 /* In this case we might have to convert down from 4-byte native
1722 wchar_t to 2-byte unicode. */
1723 else if (maxchar < 65536) {
1724 assert(num_surrogates == 0 &&
1725 "FindMaxCharAndNumSurrogatePairs() messed up");
1726
Victor Stinner506f5922011-09-28 22:34:18 +02001727#if SIZEOF_WCHAR_T == 2
1728 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001729 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001730 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1731 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1732 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001733 _PyUnicode_UTF8(unicode) = NULL;
1734 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001735#else
1736 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001737 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001738 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001739 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001740 PyErr_NoMemory();
1741 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 }
Victor Stinner506f5922011-09-28 22:34:18 +02001743 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1744 _PyUnicode_WSTR(unicode), end,
1745 PyUnicode_2BYTE_DATA(unicode));
1746 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1747 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1748 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001749 _PyUnicode_UTF8(unicode) = NULL;
1750 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001751 PyObject_FREE(_PyUnicode_WSTR(unicode));
1752 _PyUnicode_WSTR(unicode) = NULL;
1753 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1754#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 }
1756 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1757 else {
1758#if SIZEOF_WCHAR_T == 2
1759 /* in case the native representation is 2-bytes, we need to allocate a
1760 new normalized 4-byte version. */
1761 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001762 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1763 PyErr_NoMemory();
1764 return -1;
1765 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001766 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1767 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 PyErr_NoMemory();
1769 return -1;
1770 }
1771 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1772 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001773 _PyUnicode_UTF8(unicode) = NULL;
1774 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001775 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1776 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001777 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 PyObject_FREE(_PyUnicode_WSTR(unicode));
1779 _PyUnicode_WSTR(unicode) = NULL;
1780 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1781#else
1782 assert(num_surrogates == 0);
1783
Victor Stinnerc3c74152011-10-02 20:39:55 +02001784 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001786 _PyUnicode_UTF8(unicode) = NULL;
1787 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1789#endif
1790 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1791 }
1792 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001793 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 return 0;
1795}
1796
Alexander Belopolsky40018472011-02-26 01:02:56 +00001797static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001798unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799{
Walter Dörwald16807132007-05-25 13:52:07 +00001800 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 case SSTATE_NOT_INTERNED:
1802 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001803
Benjamin Peterson29060642009-01-31 22:14:21 +00001804 case SSTATE_INTERNED_MORTAL:
1805 /* revive dead object temporarily for DelItem */
1806 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001807 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 Py_FatalError(
1809 "deletion of interned string failed");
1810 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001811
Benjamin Peterson29060642009-01-31 22:14:21 +00001812 case SSTATE_INTERNED_IMMORTAL:
1813 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001814 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001815
Benjamin Peterson29060642009-01-31 22:14:21 +00001816 default:
1817 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001818 }
1819
Victor Stinner03490912011-10-03 23:45:12 +02001820 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001822 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001823 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001824 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1825 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001827 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828}
1829
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string or a cached one-character Latin-1 string), else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length 1 can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1846
Alexander Belopolsky40018472011-02-26 01:02:56 +00001847static int
Victor Stinner488fa492011-12-12 00:01:39 +01001848unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001849{
Victor Stinner488fa492011-12-12 00:01:39 +01001850 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001851 if (Py_REFCNT(unicode) != 1)
1852 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001853 if (_PyUnicode_HASH(unicode) != -1)
1854 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001855 if (PyUnicode_CHECK_INTERNED(unicode))
1856 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001857 if (!PyUnicode_CheckExact(unicode))
1858 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001859#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001860 /* singleton refcount is greater than 1 */
1861 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001862#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001863 return 1;
1864}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001865
Victor Stinnerfe226c02011-10-03 03:52:20 +02001866static int
1867unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1868{
1869 PyObject *unicode;
1870 Py_ssize_t old_length;
1871
1872 assert(p_unicode != NULL);
1873 unicode = *p_unicode;
1874
1875 assert(unicode != NULL);
1876 assert(PyUnicode_Check(unicode));
1877 assert(0 <= length);
1878
Victor Stinner910337b2011-10-03 03:20:16 +02001879 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001880 old_length = PyUnicode_WSTR_LENGTH(unicode);
1881 else
1882 old_length = PyUnicode_GET_LENGTH(unicode);
1883 if (old_length == length)
1884 return 0;
1885
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001886 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001887 _Py_INCREF_UNICODE_EMPTY();
1888 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001889 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001890 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001891 return 0;
1892 }
1893
Victor Stinner488fa492011-12-12 00:01:39 +01001894 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895 PyObject *copy = resize_copy(unicode, length);
1896 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001898 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001899 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001900 }
1901
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001903 PyObject *new_unicode = resize_compact(unicode, length);
1904 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001905 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001906 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001907 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001908 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001909 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910}
1911
Alexander Belopolsky40018472011-02-26 01:02:56 +00001912int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001913PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001914{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001915 PyObject *unicode;
1916 if (p_unicode == NULL) {
1917 PyErr_BadInternalCall();
1918 return -1;
1919 }
1920 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001921 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001922 {
1923 PyErr_BadInternalCall();
1924 return -1;
1925 }
1926 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001927}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001928
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001929/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001930
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001931 WARNING: The function doesn't copy the terminating null character and
1932 doesn't check the maximum character (may write a latin1 character in an
1933 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001934static void
1935unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1936 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001937{
1938 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1939 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001940 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001941
1942 switch (kind) {
1943 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001945#ifdef Py_DEBUG
1946 if (PyUnicode_IS_ASCII(unicode)) {
1947 Py_UCS4 maxchar = ucs1lib_find_max_char(
1948 (const Py_UCS1*)str,
1949 (const Py_UCS1*)str + len);
1950 assert(maxchar < 128);
1951 }
1952#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001953 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001954 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001955 }
1956 case PyUnicode_2BYTE_KIND: {
1957 Py_UCS2 *start = (Py_UCS2 *)data + index;
1958 Py_UCS2 *ucs2 = start;
1959 assert(index <= PyUnicode_GET_LENGTH(unicode));
1960
Victor Stinner184252a2012-06-16 02:57:41 +02001961 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001962 *ucs2 = (Py_UCS2)*str;
1963
1964 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001965 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001966 }
1967 default: {
1968 Py_UCS4 *start = (Py_UCS4 *)data + index;
1969 Py_UCS4 *ucs4 = start;
1970 assert(kind == PyUnicode_4BYTE_KIND);
1971 assert(index <= PyUnicode_GET_LENGTH(unicode));
1972
Victor Stinner184252a2012-06-16 02:57:41 +02001973 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001974 *ucs4 = (Py_UCS4)*str;
1975
1976 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001977 }
1978 }
1979}
1980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981static PyObject*
1982get_latin1_char(unsigned char ch)
1983{
Victor Stinnera464fc12011-10-02 20:39:30 +02001984 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001986 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 if (!unicode)
1988 return NULL;
1989 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001990 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 unicode_latin1[ch] = unicode;
1992 }
1993 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001994 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995}
1996
Victor Stinner985a82a2014-01-03 12:53:47 +01001997static PyObject*
1998unicode_char(Py_UCS4 ch)
1999{
2000 PyObject *unicode;
2001
2002 assert(ch <= MAX_UNICODE);
2003
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002004 if (ch < 256)
2005 return get_latin1_char(ch);
2006
Victor Stinner985a82a2014-01-03 12:53:47 +01002007 unicode = PyUnicode_New(1, ch);
2008 if (unicode == NULL)
2009 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002010
2011 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2012 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002013 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002014 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002015 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2016 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2017 }
2018 assert(_PyUnicode_CheckConsistency(unicode, 1));
2019 return unicode;
2020}
2021
Alexander Belopolsky40018472011-02-26 01:02:56 +00002022PyObject *
2023PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL)
2026 return (PyObject*)_PyUnicode_New(size);
2027
2028 if (size < 0) {
2029 PyErr_BadInternalCall();
2030 return NULL;
2031 }
2032
2033 return PyUnicode_FromWideChar(u, size);
2034}
2035
2036PyObject *
2037PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2038{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002039 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 Py_UCS4 maxchar = 0;
2041 Py_ssize_t num_surrogates;
2042
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002043 if (u == NULL && size != 0) {
2044 PyErr_BadInternalCall();
2045 return NULL;
2046 }
2047
2048 if (size == -1) {
2049 size = wcslen(u);
2050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002052 /* If the Unicode data is known at construction time, we can apply
2053 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002056 if (size == 0)
2057 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 /* Single character Unicode objects in the Latin-1 range are
2060 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002061 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 return get_latin1_char((unsigned char)*u);
2063
2064 /* If not empty and not single character, copy the Unicode data
2065 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002066 if (find_maxchar_surrogates(u, u + size,
2067 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 return NULL;
2069
Victor Stinner8faf8212011-12-08 22:14:11 +01002070 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 if (!unicode)
2072 return NULL;
2073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 switch (PyUnicode_KIND(unicode)) {
2075 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002076 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2078 break;
2079 case PyUnicode_2BYTE_KIND:
2080#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002081 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002083 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2085#endif
2086 break;
2087 case PyUnicode_4BYTE_KIND:
2088#if SIZEOF_WCHAR_T == 2
2089 /* This is the only case which has to process surrogates, thus
2090 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002091 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092#else
2093 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002094 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095#endif
2096 break;
2097 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002098 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002101 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102}
2103
Alexander Belopolsky40018472011-02-26 01:02:56 +00002104PyObject *
2105PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002106{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002107 if (size < 0) {
2108 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002109 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002110 return NULL;
2111 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002112 if (u != NULL)
2113 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2114 else
2115 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002116}
2117
Alexander Belopolsky40018472011-02-26 01:02:56 +00002118PyObject *
2119PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002120{
2121 size_t size = strlen(u);
2122 if (size > PY_SSIZE_T_MAX) {
2123 PyErr_SetString(PyExc_OverflowError, "input too long");
2124 return NULL;
2125 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002126 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002127}
2128
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002129PyObject *
2130_PyUnicode_FromId(_Py_Identifier *id)
2131{
2132 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002133 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2134 strlen(id->string),
2135 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002136 if (!id->object)
2137 return NULL;
2138 PyUnicode_InternInPlace(&id->object);
2139 assert(!id->next);
2140 id->next = static_strings;
2141 static_strings = id;
2142 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002143 return id->object;
2144}
2145
2146void
2147_PyUnicode_ClearStaticStrings()
2148{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002149 _Py_Identifier *tmp, *s = static_strings;
2150 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002151 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002152 tmp = s->next;
2153 s->next = NULL;
2154 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002155 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002156 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002157}
2158
Benjamin Peterson0df54292012-03-26 14:50:32 -04002159/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002160
Victor Stinnerd3f08822012-05-29 12:57:52 +02002161PyObject*
2162_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002163{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002164 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002165 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002166 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002167#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002168 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002169#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002170 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002171 }
Victor Stinner785938e2011-12-11 20:09:03 +01002172 unicode = PyUnicode_New(size, 127);
2173 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002174 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002175 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2176 assert(_PyUnicode_CheckConsistency(unicode, 1));
2177 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002178}
2179
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002180static Py_UCS4
2181kind_maxchar_limit(unsigned int kind)
2182{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002183 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002184 case PyUnicode_1BYTE_KIND:
2185 return 0x80;
2186 case PyUnicode_2BYTE_KIND:
2187 return 0x100;
2188 case PyUnicode_4BYTE_KIND:
2189 return 0x10000;
2190 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002191 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002192 }
2193}
2194
Victor Stinner702c7342011-10-05 13:50:52 +02002195static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002196_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002197{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002199 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002200
Serhiy Storchaka678db842013-01-26 12:16:36 +02002201 if (size == 0)
2202 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002203 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002204 if (size == 1)
2205 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002206
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002207 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002208 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 if (!res)
2210 return NULL;
2211 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002212 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002214}
2215
Victor Stinnere57b1c02011-09-28 22:20:48 +02002216static PyObject*
2217_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218{
2219 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002220 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002221
Serhiy Storchaka678db842013-01-26 12:16:36 +02002222 if (size == 0)
2223 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002224 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002225 if (size == 1)
2226 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002227
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002228 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002229 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 if (!res)
2231 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002232 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002234 else {
2235 _PyUnicode_CONVERT_BYTES(
2236 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2237 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002238 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 return res;
2240}
2241
Victor Stinnere57b1c02011-09-28 22:20:48 +02002242static PyObject*
2243_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244{
2245 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002247
Serhiy Storchaka678db842013-01-26 12:16:36 +02002248 if (size == 0)
2249 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002250 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002251 if (size == 1)
2252 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002253
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002254 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002255 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 if (!res)
2257 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002258 if (max_char < 256)
2259 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2260 PyUnicode_1BYTE_DATA(res));
2261 else if (max_char < 0x10000)
2262 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2263 PyUnicode_2BYTE_DATA(res));
2264 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002266 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 return res;
2268}
2269
2270PyObject*
2271PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2272{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002273 if (size < 0) {
2274 PyErr_SetString(PyExc_ValueError, "size must be positive");
2275 return NULL;
2276 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002277 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002281 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002283 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002284 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002285 PyErr_SetString(PyExc_SystemError, "invalid kind");
2286 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288}
2289
Victor Stinnerece58de2012-04-23 23:36:38 +02002290Py_UCS4
2291_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2292{
2293 enum PyUnicode_Kind kind;
2294 void *startptr, *endptr;
2295
2296 assert(PyUnicode_IS_READY(unicode));
2297 assert(0 <= start);
2298 assert(end <= PyUnicode_GET_LENGTH(unicode));
2299 assert(start <= end);
2300
2301 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2302 return PyUnicode_MAX_CHAR_VALUE(unicode);
2303
2304 if (start == end)
2305 return 127;
2306
Victor Stinner94d558b2012-04-27 22:26:58 +02002307 if (PyUnicode_IS_ASCII(unicode))
2308 return 127;
2309
Victor Stinnerece58de2012-04-23 23:36:38 +02002310 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002311 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002312 endptr = (char *)startptr + end * kind;
2313 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002314 switch(kind) {
2315 case PyUnicode_1BYTE_KIND:
2316 return ucs1lib_find_max_char(startptr, endptr);
2317 case PyUnicode_2BYTE_KIND:
2318 return ucs2lib_find_max_char(startptr, endptr);
2319 case PyUnicode_4BYTE_KIND:
2320 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002321 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002322 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002323 }
2324}
2325
Victor Stinner25a4b292011-10-06 12:31:55 +02002326/* Ensure that a string uses the most efficient storage, if it is not the
2327 case: create a new string with of the right kind. Write NULL into *p_unicode
2328 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002329static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002330unicode_adjust_maxchar(PyObject **p_unicode)
2331{
2332 PyObject *unicode, *copy;
2333 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002334 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002335 unsigned int kind;
2336
2337 assert(p_unicode != NULL);
2338 unicode = *p_unicode;
2339 assert(PyUnicode_IS_READY(unicode));
2340 if (PyUnicode_IS_ASCII(unicode))
2341 return;
2342
2343 len = PyUnicode_GET_LENGTH(unicode);
2344 kind = PyUnicode_KIND(unicode);
2345 if (kind == PyUnicode_1BYTE_KIND) {
2346 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002347 max_char = ucs1lib_find_max_char(u, u + len);
2348 if (max_char >= 128)
2349 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002350 }
2351 else if (kind == PyUnicode_2BYTE_KIND) {
2352 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002353 max_char = ucs2lib_find_max_char(u, u + len);
2354 if (max_char >= 256)
2355 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002356 }
2357 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002358 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002359 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002360 max_char = ucs4lib_find_max_char(u, u + len);
2361 if (max_char >= 0x10000)
2362 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002363 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002365 if (copy != NULL)
2366 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002367 Py_DECREF(unicode);
2368 *p_unicode = copy;
2369}
2370
Victor Stinner034f6cf2011-09-30 02:26:44 +02002371PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002372_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002373{
Victor Stinner87af4f22011-11-21 23:03:47 +01002374 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002375 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002376
Victor Stinner034f6cf2011-09-30 02:26:44 +02002377 if (!PyUnicode_Check(unicode)) {
2378 PyErr_BadInternalCall();
2379 return NULL;
2380 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002381 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002382 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383
Victor Stinner87af4f22011-11-21 23:03:47 +01002384 length = PyUnicode_GET_LENGTH(unicode);
2385 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002386 if (!copy)
2387 return NULL;
2388 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2389
Christian Heimesf051e432016-09-13 20:22:02 +02002390 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002391 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002392 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002393 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002394}
2395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396
Victor Stinnerbc603d12011-10-02 01:00:40 +02002397/* Widen Unicode objects to larger buffers. Don't write terminating null
2398 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399
2400void*
2401_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2402{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002403 Py_ssize_t len;
2404 void *result;
2405 unsigned int skind;
2406
Benjamin Petersonbac79492012-01-14 13:34:47 -05002407 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002408 return NULL;
2409
2410 len = PyUnicode_GET_LENGTH(s);
2411 skind = PyUnicode_KIND(s);
2412 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002413 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 return NULL;
2415 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002416 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002417 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002418 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002419 if (!result)
2420 return PyErr_NoMemory();
2421 assert(skind == PyUnicode_1BYTE_KIND);
2422 _PyUnicode_CONVERT_BYTES(
2423 Py_UCS1, Py_UCS2,
2424 PyUnicode_1BYTE_DATA(s),
2425 PyUnicode_1BYTE_DATA(s) + len,
2426 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002427 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002428 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002429 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002430 if (!result)
2431 return PyErr_NoMemory();
2432 if (skind == PyUnicode_2BYTE_KIND) {
2433 _PyUnicode_CONVERT_BYTES(
2434 Py_UCS2, Py_UCS4,
2435 PyUnicode_2BYTE_DATA(s),
2436 PyUnicode_2BYTE_DATA(s) + len,
2437 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002439 else {
2440 assert(skind == PyUnicode_1BYTE_KIND);
2441 _PyUnicode_CONVERT_BYTES(
2442 Py_UCS1, Py_UCS4,
2443 PyUnicode_1BYTE_DATA(s),
2444 PyUnicode_1BYTE_DATA(s) + len,
2445 result);
2446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002448 default:
2449 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 }
Victor Stinner01698042011-10-04 00:04:26 +02002451 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 return NULL;
2453}
2454
2455static Py_UCS4*
2456as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2457 int copy_null)
2458{
2459 int kind;
2460 void *data;
2461 Py_ssize_t len, targetlen;
2462 if (PyUnicode_READY(string) == -1)
2463 return NULL;
2464 kind = PyUnicode_KIND(string);
2465 data = PyUnicode_DATA(string);
2466 len = PyUnicode_GET_LENGTH(string);
2467 targetlen = len;
2468 if (copy_null)
2469 targetlen++;
2470 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002471 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 if (!target) {
2473 PyErr_NoMemory();
2474 return NULL;
2475 }
2476 }
2477 else {
2478 if (targetsize < targetlen) {
2479 PyErr_Format(PyExc_SystemError,
2480 "string is longer than the buffer");
2481 if (copy_null && 0 < targetsize)
2482 target[0] = 0;
2483 return NULL;
2484 }
2485 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002486 if (kind == PyUnicode_1BYTE_KIND) {
2487 Py_UCS1 *start = (Py_UCS1 *) data;
2488 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002490 else if (kind == PyUnicode_2BYTE_KIND) {
2491 Py_UCS2 *start = (Py_UCS2 *) data;
2492 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2493 }
2494 else {
2495 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002496 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 if (copy_null)
2499 target[len] = 0;
2500 return target;
2501}
2502
2503Py_UCS4*
2504PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2505 int copy_null)
2506{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002507 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 PyErr_BadInternalCall();
2509 return NULL;
2510 }
2511 return as_ucs4(string, target, targetsize, copy_null);
2512}
2513
2514Py_UCS4*
2515PyUnicode_AsUCS4Copy(PyObject *string)
2516{
2517 return as_ucs4(string, NULL, 0, 1);
2518}
2519
Victor Stinner15a11362012-10-06 23:48:20 +02002520/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002521 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2522 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2523#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002524
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002525static int
2526unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2527 Py_ssize_t width, Py_ssize_t precision)
2528{
2529 Py_ssize_t length, fill, arglen;
2530 Py_UCS4 maxchar;
2531
2532 if (PyUnicode_READY(str) == -1)
2533 return -1;
2534
2535 length = PyUnicode_GET_LENGTH(str);
2536 if ((precision == -1 || precision >= length)
2537 && width <= length)
2538 return _PyUnicodeWriter_WriteStr(writer, str);
2539
2540 if (precision != -1)
2541 length = Py_MIN(precision, length);
2542
2543 arglen = Py_MAX(length, width);
2544 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2545 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2546 else
2547 maxchar = writer->maxchar;
2548
2549 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2550 return -1;
2551
2552 if (width > length) {
2553 fill = width - length;
2554 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2555 return -1;
2556 writer->pos += fill;
2557 }
2558
2559 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2560 str, 0, length);
2561 writer->pos += length;
2562 return 0;
2563}
2564
2565static int
2566unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2567 Py_ssize_t width, Py_ssize_t precision)
2568{
2569 /* UTF-8 */
2570 Py_ssize_t length;
2571 PyObject *unicode;
2572 int res;
2573
2574 length = strlen(str);
2575 if (precision != -1)
2576 length = Py_MIN(length, precision);
2577 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2578 if (unicode == NULL)
2579 return -1;
2580
2581 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2582 Py_DECREF(unicode);
2583 return res;
2584}
2585
Victor Stinner96865452011-03-01 23:44:09 +00002586static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002587unicode_fromformat_arg(_PyUnicodeWriter *writer,
2588 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002589{
Victor Stinnere215d962012-10-06 23:03:36 +02002590 const char *p;
2591 Py_ssize_t len;
2592 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002593 Py_ssize_t width;
2594 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002595 int longflag;
2596 int longlongflag;
2597 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002598 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002599
2600 p = f;
2601 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002602 zeropad = 0;
2603 if (*f == '0') {
2604 zeropad = 1;
2605 f++;
2606 }
Victor Stinner96865452011-03-01 23:44:09 +00002607
2608 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002609 width = -1;
2610 if (Py_ISDIGIT((unsigned)*f)) {
2611 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002612 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002613 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002614 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002615 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002617 return NULL;
2618 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002620 f++;
2621 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002622 }
2623 precision = -1;
2624 if (*f == '.') {
2625 f++;
2626 if (Py_ISDIGIT((unsigned)*f)) {
2627 precision = (*f - '0');
2628 f++;
2629 while (Py_ISDIGIT((unsigned)*f)) {
2630 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2631 PyErr_SetString(PyExc_ValueError,
2632 "precision too big");
2633 return NULL;
2634 }
2635 precision = (precision * 10) + (*f - '0');
2636 f++;
2637 }
2638 }
Victor Stinner96865452011-03-01 23:44:09 +00002639 if (*f == '%') {
2640 /* "%.3%s" => f points to "3" */
2641 f--;
2642 }
2643 }
2644 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002645 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002646 f--;
2647 }
Victor Stinner96865452011-03-01 23:44:09 +00002648
2649 /* Handle %ld, %lu, %lld and %llu. */
2650 longflag = 0;
2651 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002652 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002653 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002654 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002655 longflag = 1;
2656 ++f;
2657 }
Victor Stinner96865452011-03-01 23:44:09 +00002658 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002659 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002660 longlongflag = 1;
2661 f += 2;
2662 }
Victor Stinner96865452011-03-01 23:44:09 +00002663 }
2664 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002665 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002666 size_tflag = 1;
2667 ++f;
2668 }
Victor Stinnere215d962012-10-06 23:03:36 +02002669
2670 if (f[1] == '\0')
2671 writer->overallocate = 0;
2672
2673 switch (*f) {
2674 case 'c':
2675 {
2676 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002677 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002678 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002679 "character argument not in range(0x110000)");
2680 return NULL;
2681 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002682 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002683 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002684 break;
2685 }
2686
2687 case 'i':
2688 case 'd':
2689 case 'u':
2690 case 'x':
2691 {
2692 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002693 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002694 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002695
2696 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002697 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002698 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002699 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002700 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002701 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002702 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002703 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, size_t));
2706 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002707 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002708 va_arg(*vargs, unsigned int));
2709 }
2710 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002712 }
2713 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002714 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002715 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002716 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002717 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002718 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002719 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002720 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002721 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_arg(*vargs, Py_ssize_t));
2723 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002724 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002725 va_arg(*vargs, int));
2726 }
2727 assert(len >= 0);
2728
Victor Stinnere215d962012-10-06 23:03:36 +02002729 if (precision < len)
2730 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002731
2732 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002733 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2734 return NULL;
2735
Victor Stinnere215d962012-10-06 23:03:36 +02002736 if (width > precision) {
2737 Py_UCS4 fillchar;
2738 fill = width - precision;
2739 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002740 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2741 return NULL;
2742 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002743 }
Victor Stinner15a11362012-10-06 23:48:20 +02002744 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002745 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002746 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2747 return NULL;
2748 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002749 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002750
Victor Stinner4a587072013-11-19 12:54:53 +01002751 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2752 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002753 break;
2754 }
2755
2756 case 'p':
2757 {
2758 char number[MAX_LONG_LONG_CHARS];
2759
2760 len = sprintf(number, "%p", va_arg(*vargs, void*));
2761 assert(len >= 0);
2762
2763 /* %p is ill-defined: ensure leading 0x. */
2764 if (number[1] == 'X')
2765 number[1] = 'x';
2766 else if (number[1] != 'x') {
2767 memmove(number + 2, number,
2768 strlen(number) + 1);
2769 number[0] = '0';
2770 number[1] = 'x';
2771 len += 2;
2772 }
2773
Victor Stinner4a587072013-11-19 12:54:53 +01002774 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002775 return NULL;
2776 break;
2777 }
2778
2779 case 's':
2780 {
2781 /* UTF-8 */
2782 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002783 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002784 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002785 break;
2786 }
2787
2788 case 'U':
2789 {
2790 PyObject *obj = va_arg(*vargs, PyObject *);
2791 assert(obj && _PyUnicode_CHECK(obj));
2792
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002793 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002794 return NULL;
2795 break;
2796 }
2797
2798 case 'V':
2799 {
2800 PyObject *obj = va_arg(*vargs, PyObject *);
2801 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002802 if (obj) {
2803 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002805 return NULL;
2806 }
2807 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002808 assert(str != NULL);
2809 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002810 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002811 }
2812 break;
2813 }
2814
2815 case 'S':
2816 {
2817 PyObject *obj = va_arg(*vargs, PyObject *);
2818 PyObject *str;
2819 assert(obj);
2820 str = PyObject_Str(obj);
2821 if (!str)
2822 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002823 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002824 Py_DECREF(str);
2825 return NULL;
2826 }
2827 Py_DECREF(str);
2828 break;
2829 }
2830
2831 case 'R':
2832 {
2833 PyObject *obj = va_arg(*vargs, PyObject *);
2834 PyObject *repr;
2835 assert(obj);
2836 repr = PyObject_Repr(obj);
2837 if (!repr)
2838 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002839 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002840 Py_DECREF(repr);
2841 return NULL;
2842 }
2843 Py_DECREF(repr);
2844 break;
2845 }
2846
2847 case 'A':
2848 {
2849 PyObject *obj = va_arg(*vargs, PyObject *);
2850 PyObject *ascii;
2851 assert(obj);
2852 ascii = PyObject_ASCII(obj);
2853 if (!ascii)
2854 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002855 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002856 Py_DECREF(ascii);
2857 return NULL;
2858 }
2859 Py_DECREF(ascii);
2860 break;
2861 }
2862
2863 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002864 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002865 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002866 break;
2867
2868 default:
2869 /* if we stumble upon an unknown formatting code, copy the rest
2870 of the format string to the output string. (we cannot just
2871 skip the code, since there's no way to know what's in the
2872 argument list) */
2873 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002874 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002875 return NULL;
2876 f = p+len;
2877 return f;
2878 }
2879
2880 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002881 return f;
2882}
2883
Walter Dörwaldd2034312007-05-18 16:29:38 +00002884PyObject *
2885PyUnicode_FromFormatV(const char *format, va_list vargs)
2886{
Victor Stinnere215d962012-10-06 23:03:36 +02002887 va_list vargs2;
2888 const char *f;
2889 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002890
Victor Stinner8f674cc2013-04-17 23:02:17 +02002891 _PyUnicodeWriter_Init(&writer);
2892 writer.min_length = strlen(format) + 100;
2893 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002894
Benjamin Peterson0c212142016-09-20 20:39:33 -07002895 // Copy varags to be able to pass a reference to a subfunction.
2896 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002897
2898 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002899 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002900 f = unicode_fromformat_arg(&writer, f, &vargs2);
2901 if (f == NULL)
2902 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002904 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002905 const char *p;
2906 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002907
Victor Stinnere215d962012-10-06 23:03:36 +02002908 p = f;
2909 do
2910 {
2911 if ((unsigned char)*p > 127) {
2912 PyErr_Format(PyExc_ValueError,
2913 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2914 "string, got a non-ASCII byte: 0x%02x",
2915 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002916 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002917 }
2918 p++;
2919 }
2920 while (*p != '\0' && *p != '%');
2921 len = p - f;
2922
2923 if (*p == '\0')
2924 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002925
2926 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002927 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002928
2929 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002931 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002932 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002933 return _PyUnicodeWriter_Finish(&writer);
2934
2935 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002936 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002937 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002939}
2940
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941PyObject *
2942PyUnicode_FromFormat(const char *format, ...)
2943{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002944 PyObject* ret;
2945 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946
2947#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002948 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002949#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002950 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002951#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 ret = PyUnicode_FromFormatV(format, vargs);
2953 va_end(vargs);
2954 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002955}
2956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957#ifdef HAVE_WCHAR_H
2958
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002959/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960
Victor Stinnerd88d9832011-09-06 02:00:05 +02002961 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002962 character) required to convert the unicode object. Ignore size argument.
2963
Victor Stinnerd88d9832011-09-06 02:00:05 +02002964 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002965 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002966 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002967Py_ssize_t
2968PyUnicode_AsWideChar(PyObject *unicode,
2969 wchar_t *w,
2970 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002971{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002972 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002973 const wchar_t *wstr;
2974
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002975 if (unicode == NULL) {
2976 PyErr_BadInternalCall();
2977 return -1;
2978 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002979 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002980 if (wstr == NULL)
2981 return -1;
2982
Victor Stinner5593d8a2010-10-02 11:11:27 +00002983 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002984 if (size > res)
2985 size = res + 1;
2986 else
2987 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002988 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002989 return res;
2990 }
2991 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002992 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002993}
2994
Victor Stinner137c34c2010-09-29 10:25:54 +00002995wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002996PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002997 Py_ssize_t *size)
2998{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002999 const wchar_t *wstr;
3000 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003001 Py_ssize_t buflen;
3002
3003 if (unicode == NULL) {
3004 PyErr_BadInternalCall();
3005 return NULL;
3006 }
3007
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003008 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3009 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003010 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003011 }
3012 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3013 PyErr_SetString(PyExc_ValueError,
3014 "embedded null character");
3015 return NULL;
3016 }
3017
3018 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003019 if (buffer == NULL) {
3020 PyErr_NoMemory();
3021 return NULL;
3022 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003023 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003024 if (size != NULL)
3025 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003026 return buffer;
3027}
3028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003029#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030
Alexander Belopolsky40018472011-02-26 01:02:56 +00003031PyObject *
3032PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003033{
Victor Stinner8faf8212011-12-08 22:14:11 +01003034 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 PyErr_SetString(PyExc_ValueError,
3036 "chr() arg not in range(0x110000)");
3037 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003038 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003039
Victor Stinner985a82a2014-01-03 12:53:47 +01003040 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003041}
3042
Alexander Belopolsky40018472011-02-26 01:02:56 +00003043PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003044PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003046 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003048 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003049 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003050 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 Py_INCREF(obj);
3052 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003053 }
3054 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 /* For a Unicode subtype that's not a Unicode object,
3056 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003057 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003058 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003059 PyErr_Format(PyExc_TypeError,
3060 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003061 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003062 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003063}
3064
Alexander Belopolsky40018472011-02-26 01:02:56 +00003065PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003066PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003067 const char *encoding,
3068 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003069{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003071 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003072
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 PyErr_BadInternalCall();
3075 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003077
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003078 /* Decoding bytes objects is the most common case and should be fast */
3079 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003080 if (PyBytes_GET_SIZE(obj) == 0)
3081 _Py_RETURN_UNICODE_EMPTY();
3082 v = PyUnicode_Decode(
3083 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3084 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003085 return v;
3086 }
3087
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003088 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 PyErr_SetString(PyExc_TypeError,
3090 "decoding str is not supported");
3091 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003092 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003093
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003094 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3095 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3096 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003097 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003098 Py_TYPE(obj)->tp_name);
3099 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003100 }
Tim Petersced69f82003-09-16 20:30:58 +00003101
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003102 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003103 PyBuffer_Release(&buffer);
3104 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003106
Serhiy Storchaka05997252013-01-26 12:14:02 +02003107 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003108 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003109 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110}
3111
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased with Py_TOLOWER);
   every run of other characters is replaced by a single '_', except that
   leading and trailing runs are dropped.  The result written to `lower` is
   always null-terminated on success.

   `lower_len` is the total size of the `lower` buffer in bytes, including
   room for the terminating null byte.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the terminator fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the terminator; without this
       check `lower_len - 1` would wrap around and the writes below would
       run past the buffer. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* last slot, reserved for '\0' */
    punct = 0;
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is emitted lazily so that leading
               and trailing separators never reach the output. */
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3160
Alexander Belopolsky40018472011-02-26 01:02:56 +00003161PyObject *
3162PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003163 Py_ssize_t size,
3164 const char *encoding,
3165 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003166{
3167 PyObject *buffer = NULL, *unicode;
3168 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003169 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3170
3171 if (encoding == NULL) {
3172 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3173 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003174
Fred Drakee4315f52000-05-09 19:53:39 +00003175 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003176 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3177 char *lower = buflower;
3178
3179 /* Fast paths */
3180 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3181 lower += 3;
3182 if (*lower == '_') {
3183 /* Match "utf8" and "utf_8" */
3184 lower++;
3185 }
3186
3187 if (lower[0] == '8' && lower[1] == 0) {
3188 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3189 }
3190 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3191 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3192 }
3193 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3194 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3195 }
3196 }
3197 else {
3198 if (strcmp(lower, "ascii") == 0
3199 || strcmp(lower, "us_ascii") == 0) {
3200 return PyUnicode_DecodeASCII(s, size, errors);
3201 }
Steve Dowercc16be82016-09-08 10:35:16 -07003202 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003203 else if (strcmp(lower, "mbcs") == 0) {
3204 return PyUnicode_DecodeMBCS(s, size, errors);
3205 }
3206 #endif
3207 else if (strcmp(lower, "latin1") == 0
3208 || strcmp(lower, "latin_1") == 0
3209 || strcmp(lower, "iso_8859_1") == 0
3210 || strcmp(lower, "iso8859_1") == 0) {
3211 return PyUnicode_DecodeLatin1(s, size, errors);
3212 }
3213 }
Victor Stinner37296e82010-06-10 13:36:23 +00003214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215
3216 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003217 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003218 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003219 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003220 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 if (buffer == NULL)
3222 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003223 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 if (unicode == NULL)
3225 goto onError;
3226 if (!PyUnicode_Check(unicode)) {
3227 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003228 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3229 "use codecs.decode() to decode to arbitrary types",
3230 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003231 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 Py_DECREF(unicode);
3233 goto onError;
3234 }
3235 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003236 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003237
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 Py_XDECREF(buffer);
3240 return NULL;
3241}
3242
Alexander Belopolsky40018472011-02-26 01:02:56 +00003243PyObject *
3244PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003245 const char *encoding,
3246 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003247{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003248 if (!PyUnicode_Check(unicode)) {
3249 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003250 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003251 }
3252
Serhiy Storchaka00939072016-10-27 21:05:49 +03003253 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3254 "PyUnicode_AsDecodedObject() is deprecated; "
3255 "use PyCodec_Decode() to decode from str", 1) < 0)
3256 return NULL;
3257
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003258 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003259 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003260
3261 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003262 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003263}
3264
Alexander Belopolsky40018472011-02-26 01:02:56 +00003265PyObject *
3266PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003267 const char *encoding,
3268 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003269{
3270 PyObject *v;
3271
3272 if (!PyUnicode_Check(unicode)) {
3273 PyErr_BadArgument();
3274 goto onError;
3275 }
3276
Serhiy Storchaka00939072016-10-27 21:05:49 +03003277 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3278 "PyUnicode_AsDecodedUnicode() is deprecated; "
3279 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3280 return NULL;
3281
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003282 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284
3285 /* Decode via the codec registry */
3286 v = PyCodec_Decode(unicode, encoding, errors);
3287 if (v == NULL)
3288 goto onError;
3289 if (!PyUnicode_Check(v)) {
3290 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003291 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3292 "use codecs.decode() to decode to arbitrary types",
3293 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003294 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003295 Py_DECREF(v);
3296 goto onError;
3297 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003298 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003299
Benjamin Peterson29060642009-01-31 22:14:21 +00003300 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003301 return NULL;
3302}
3303
Alexander Belopolsky40018472011-02-26 01:02:56 +00003304PyObject *
3305PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003306 Py_ssize_t size,
3307 const char *encoding,
3308 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309{
3310 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003311
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003312 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3316 Py_DECREF(unicode);
3317 return v;
3318}
3319
Alexander Belopolsky40018472011-02-26 01:02:56 +00003320PyObject *
3321PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003322 const char *encoding,
3323 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003324{
3325 PyObject *v;
3326
3327 if (!PyUnicode_Check(unicode)) {
3328 PyErr_BadArgument();
3329 goto onError;
3330 }
3331
Serhiy Storchaka00939072016-10-27 21:05:49 +03003332 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3333 "PyUnicode_AsEncodedObject() is deprecated; "
3334 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3335 "or PyCodec_Encode() for generic encoding", 1) < 0)
3336 return NULL;
3337
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003338 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003339 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003340
3341 /* Encode via the codec registry */
3342 v = PyCodec_Encode(unicode, encoding, errors);
3343 if (v == NULL)
3344 goto onError;
3345 return v;
3346
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003348 return NULL;
3349}
3350
Victor Stinner1b579672011-12-17 05:47:23 +01003351static int
3352locale_error_handler(const char *errors, int *surrogateescape)
3353{
Victor Stinner50149202015-09-22 00:26:54 +02003354 _Py_error_handler error_handler = get_error_handler(errors);
3355 switch (error_handler)
3356 {
3357 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003358 *surrogateescape = 0;
3359 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003360 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003361 *surrogateescape = 1;
3362 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003363 default:
3364 PyErr_Format(PyExc_ValueError,
3365 "only 'strict' and 'surrogateescape' error handlers "
3366 "are supported, not '%s'",
3367 errors);
3368 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003369 }
Victor Stinner1b579672011-12-17 05:47:23 +01003370}
3371
Victor Stinner2cba6b82018-01-10 22:46:15 +01003372static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003373unicode_encode_locale(PyObject *unicode, const char *errors,
3374 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003375{
Victor Stinner1b579672011-12-17 05:47:23 +01003376 int surrogateescape;
Victor Stinner1b579672011-12-17 05:47:23 +01003377 if (locale_error_handler(errors, &surrogateescape) < 0)
3378 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003379
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003380 Py_ssize_t wlen;
3381 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3382 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003383 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003384 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003385
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003386 Py_ssize_t wlen2 = wcslen(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003387 if (wlen2 != wlen) {
3388 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003389 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003390 return NULL;
3391 }
3392
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003393 char *str;
3394 size_t error_pos;
3395 const char *reason;
3396 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3397 current_locale, surrogateescape);
3398 if (res != 0) {
3399 if (res == -2) {
3400 PyObject *exc;
3401 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3402 "locale", unicode,
3403 (Py_ssize_t)error_pos,
3404 (Py_ssize_t)(error_pos+1),
3405 reason);
3406 if (exc != NULL) {
3407 PyCodec_StrictErrors(exc);
3408 Py_DECREF(exc);
3409 }
3410 return NULL;
Victor Stinner2cba6b82018-01-10 22:46:15 +01003411 }
3412 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003413 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003414 PyMem_Free(wstr);
3415 return NULL;
3416 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003417 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003420 PyObject *bytes = PyBytes_FromString(str);
3421 PyMem_RawFree(str);
3422 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003423}
3424
Victor Stinnerad158722010-10-27 00:25:46 +00003425PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003426PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3427{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003428 return unicode_encode_locale(unicode, errors, 1);
3429}
3430
3431PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003432PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003433{
Steve Dowercc16be82016-09-08 10:35:16 -07003434#if defined(__APPLE__)
3435 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003436#else
Victor Stinner793b5312011-04-27 00:24:21 +02003437 PyInterpreterState *interp = PyThreadState_GET()->interp;
3438 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3439 cannot use it to encode and decode filenames before it is loaded. Load
3440 the Python codec requires to encode at least its own filename. Use the C
3441 version of the locale codec until the codec registry is initialized and
3442 the Python codec is loaded.
3443
3444 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3445 cannot only rely on it: check also interp->fscodec_initialized for
3446 subinterpreters. */
3447 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003448 return PyUnicode_AsEncodedString(unicode,
3449 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003450 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003451 }
3452 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003453 return unicode_encode_locale(unicode,
3454 Py_FileSystemDefaultEncodeErrors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003455 }
Victor Stinnerad158722010-10-27 00:25:46 +00003456#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003457}
3458
Alexander Belopolsky40018472011-02-26 01:02:56 +00003459PyObject *
3460PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003461 const char *encoding,
3462 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463{
3464 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003465 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003466
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 if (!PyUnicode_Check(unicode)) {
3468 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 }
Fred Drakee4315f52000-05-09 19:53:39 +00003471
Victor Stinner942889a2016-09-05 15:40:10 -07003472 if (encoding == NULL) {
3473 return _PyUnicode_AsUTF8String(unicode, errors);
3474 }
3475
Fred Drakee4315f52000-05-09 19:53:39 +00003476 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003477 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3478 char *lower = buflower;
3479
3480 /* Fast paths */
3481 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3482 lower += 3;
3483 if (*lower == '_') {
3484 /* Match "utf8" and "utf_8" */
3485 lower++;
3486 }
3487
3488 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003489 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003490 }
3491 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3492 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3493 }
3494 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3495 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3496 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003497 }
Victor Stinner942889a2016-09-05 15:40:10 -07003498 else {
3499 if (strcmp(lower, "ascii") == 0
3500 || strcmp(lower, "us_ascii") == 0) {
3501 return _PyUnicode_AsASCIIString(unicode, errors);
3502 }
Steve Dowercc16be82016-09-08 10:35:16 -07003503#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003504 else if (strcmp(lower, "mbcs") == 0) {
3505 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3506 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003507#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003508 else if (strcmp(lower, "latin1") == 0 ||
3509 strcmp(lower, "latin_1") == 0 ||
3510 strcmp(lower, "iso_8859_1") == 0 ||
3511 strcmp(lower, "iso8859_1") == 0) {
3512 return _PyUnicode_AsLatin1String(unicode, errors);
3513 }
3514 }
Victor Stinner37296e82010-06-10 13:36:23 +00003515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516
3517 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003518 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003520 return NULL;
3521
3522 /* The normal path */
3523 if (PyBytes_Check(v))
3524 return v;
3525
3526 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003527 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003528 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003529 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003530
3531 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003532 "encoder %s returned bytearray instead of bytes; "
3533 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003534 encoding);
3535 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003536 Py_DECREF(v);
3537 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003538 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003539
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003540 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3541 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003542 Py_DECREF(v);
3543 return b;
3544 }
3545
3546 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003547 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3548 "use codecs.encode() to encode to arbitrary types",
3549 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003550 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003551 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003552 return NULL;
3553}
3554
Alexander Belopolsky40018472011-02-26 01:02:56 +00003555PyObject *
3556PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003557 const char *encoding,
3558 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003559{
3560 PyObject *v;
3561
3562 if (!PyUnicode_Check(unicode)) {
3563 PyErr_BadArgument();
3564 goto onError;
3565 }
3566
Serhiy Storchaka00939072016-10-27 21:05:49 +03003567 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3568 "PyUnicode_AsEncodedUnicode() is deprecated; "
3569 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3570 return NULL;
3571
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003572 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003573 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003574
3575 /* Encode via the codec registry */
3576 v = PyCodec_Encode(unicode, encoding, errors);
3577 if (v == NULL)
3578 goto onError;
3579 if (!PyUnicode_Check(v)) {
3580 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003581 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3582 "use codecs.encode() to encode to arbitrary types",
3583 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003584 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003585 Py_DECREF(v);
3586 goto onError;
3587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003589
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 return NULL;
3592}
3593
Victor Stinner2cba6b82018-01-10 22:46:15 +01003594static PyObject*
3595unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3596 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003597{
Victor Stinner1b579672011-12-17 05:47:23 +01003598 int surrogateescape;
Victor Stinner1b579672011-12-17 05:47:23 +01003599 if (locale_error_handler(errors, &surrogateescape) < 0)
3600 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003601
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003602 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3603 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003604 return NULL;
3605 }
3606
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003607 wchar_t *wstr;
3608 size_t wlen;
3609 const char *reason;
3610 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3611 current_locale, surrogateescape);
3612 if (res != 0) {
3613 if (res == -2) {
3614 PyObject *exc;
3615 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3616 "locale", str, len,
3617 (Py_ssize_t)wlen,
3618 (Py_ssize_t)(wlen + 1),
3619 reason);
3620 if (exc != NULL) {
3621 PyCodec_StrictErrors(exc);
3622 Py_DECREF(exc);
3623 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003624 }
3625 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003626 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003627 }
Victor Stinner2f197072011-12-17 07:08:30 +01003628 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003629 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003630
3631 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3632 PyMem_RawFree(wstr);
3633 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003634}
3635
3636PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003637PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3638 const char *errors)
3639{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003640 return unicode_decode_locale(str, len, errors, 1);
3641}
3642
3643PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003644PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003645{
3646 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003647 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003648}
3649
3650
3651PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003652PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003653 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003654 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3655}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003656
Christian Heimes5894ba72007-11-04 11:43:14 +00003657PyObject*
3658PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3659{
Steve Dowercc16be82016-09-08 10:35:16 -07003660#if defined(__APPLE__)
3661 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003662#else
Victor Stinner793b5312011-04-27 00:24:21 +02003663 PyInterpreterState *interp = PyThreadState_GET()->interp;
3664 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3665 cannot use it to encode and decode filenames before it is loaded. Load
3666 the Python codec requires to encode at least its own filename. Use the C
3667 version of the locale codec until the codec registry is initialized and
3668 the Python codec is loaded.
3669
3670 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3671 cannot only rely on it: check also interp->fscodec_initialized for
3672 subinterpreters. */
3673 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003674 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003675 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003676 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003677 }
3678 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003679 return unicode_decode_locale(s, size,
3680 Py_FileSystemDefaultEncodeErrors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003681 }
Victor Stinnerad158722010-10-27 00:25:46 +00003682#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003683}
3684
Martin v. Löwis011e8422009-05-05 04:43:17 +00003685
3686int
3687PyUnicode_FSConverter(PyObject* arg, void* addr)
3688{
Brett Cannonec6ce872016-09-06 15:50:29 -07003689 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003690 PyObject *output = NULL;
3691 Py_ssize_t size;
3692 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003693 if (arg == NULL) {
3694 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003695 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003696 return 1;
3697 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003698 path = PyOS_FSPath(arg);
3699 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003700 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003701 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003702 if (PyBytes_Check(path)) {
3703 output = path;
3704 }
3705 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3706 output = PyUnicode_EncodeFSDefault(path);
3707 Py_DECREF(path);
3708 if (!output) {
3709 return 0;
3710 }
3711 assert(PyBytes_Check(output));
3712 }
3713
Victor Stinner0ea2a462010-04-30 00:22:08 +00003714 size = PyBytes_GET_SIZE(output);
3715 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003716 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003717 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003718 Py_DECREF(output);
3719 return 0;
3720 }
3721 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003722 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003723}
3724
3725
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003726int
3727PyUnicode_FSDecoder(PyObject* arg, void* addr)
3728{
Brett Cannona5711202016-09-06 19:36:01 -07003729 int is_buffer = 0;
3730 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003731 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003732 if (arg == NULL) {
3733 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003734 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003735 return 1;
3736 }
Brett Cannona5711202016-09-06 19:36:01 -07003737
3738 is_buffer = PyObject_CheckBuffer(arg);
3739 if (!is_buffer) {
3740 path = PyOS_FSPath(arg);
3741 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003742 return 0;
3743 }
Brett Cannona5711202016-09-06 19:36:01 -07003744 }
3745 else {
3746 path = arg;
3747 Py_INCREF(arg);
3748 }
3749
3750 if (PyUnicode_Check(path)) {
3751 if (PyUnicode_READY(path) == -1) {
3752 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003753 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003754 }
3755 output = path;
3756 }
3757 else if (PyBytes_Check(path) || is_buffer) {
3758 PyObject *path_bytes = NULL;
3759
3760 if (!PyBytes_Check(path) &&
3761 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3762 "path should be string, bytes, or os.PathLike, not %.200s",
3763 Py_TYPE(arg)->tp_name)) {
3764 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003765 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003766 }
3767 path_bytes = PyBytes_FromObject(path);
3768 Py_DECREF(path);
3769 if (!path_bytes) {
3770 return 0;
3771 }
3772 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3773 PyBytes_GET_SIZE(path_bytes));
3774 Py_DECREF(path_bytes);
3775 if (!output) {
3776 return 0;
3777 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003778 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003779 else {
3780 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003781 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003782 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003783 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003784 return 0;
3785 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003786 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003787 Py_DECREF(output);
3788 return 0;
3789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003791 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003792 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003793 Py_DECREF(output);
3794 return 0;
3795 }
3796 *(PyObject**)addr = output;
3797 return Py_CLEANUP_SUPPORTED;
3798}
3799
3800
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003801const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003802PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003803{
Christian Heimesf3863112007-11-22 07:46:41 +00003804 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003806 if (!PyUnicode_Check(unicode)) {
3807 PyErr_BadArgument();
3808 return NULL;
3809 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003810 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003811 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003813 if (PyUnicode_UTF8(unicode) == NULL) {
3814 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003815 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816 if (bytes == NULL)
3817 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3819 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003820 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821 Py_DECREF(bytes);
3822 return NULL;
3823 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003824 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003825 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 PyBytes_AS_STRING(bytes),
3827 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828 Py_DECREF(bytes);
3829 }
3830
3831 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003832 *psize = PyUnicode_UTF8_LENGTH(unicode);
3833 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003834}
3835
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003836const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003838{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3840}
3841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842Py_UNICODE *
3843PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 const unsigned char *one_byte;
3846#if SIZEOF_WCHAR_T == 4
3847 const Py_UCS2 *two_bytes;
3848#else
3849 const Py_UCS4 *four_bytes;
3850 const Py_UCS4 *ucs4_end;
3851 Py_ssize_t num_surrogates;
3852#endif
3853 wchar_t *w;
3854 wchar_t *wchar_end;
3855
3856 if (!PyUnicode_Check(unicode)) {
3857 PyErr_BadArgument();
3858 return NULL;
3859 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003862 assert(_PyUnicode_KIND(unicode) != 0);
3863 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003865 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003867 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3868 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 num_surrogates = 0;
3870
3871 for (; four_bytes < ucs4_end; ++four_bytes) {
3872 if (*four_bytes > 0xFFFF)
3873 ++num_surrogates;
3874 }
3875
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003876 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3877 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3878 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 PyErr_NoMemory();
3880 return NULL;
3881 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003884 w = _PyUnicode_WSTR(unicode);
3885 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3886 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3888 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003889 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003891 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3892 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 }
3894 else
3895 *w = *four_bytes;
3896
3897 if (w > wchar_end) {
Barry Warsawb2e57942017-09-14 18:13:16 -07003898 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 }
3900 }
3901 *w = 0;
3902#else
3903 /* sizeof(wchar_t) == 4 */
3904 Py_FatalError("Impossible unicode object state, wstr and str "
3905 "should share memory already.");
3906 return NULL;
3907#endif
3908 }
3909 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003910 if ((size_t)_PyUnicode_LENGTH(unicode) >
3911 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3912 PyErr_NoMemory();
3913 return NULL;
3914 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003915 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3916 (_PyUnicode_LENGTH(unicode) + 1));
3917 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918 PyErr_NoMemory();
3919 return NULL;
3920 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003921 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3922 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3923 w = _PyUnicode_WSTR(unicode);
3924 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003926 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3927 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928 for (; w < wchar_end; ++one_byte, ++w)
3929 *w = *one_byte;
3930 /* null-terminate the wstr */
3931 *w = 0;
3932 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003933 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003935 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936 for (; w < wchar_end; ++two_bytes, ++w)
3937 *w = *two_bytes;
3938 /* null-terminate the wstr */
3939 *w = 0;
3940#else
3941 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003942 PyObject_FREE(_PyUnicode_WSTR(unicode));
3943 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 Py_FatalError("Impossible unicode object state, wstr "
3945 "and str should share memory already.");
3946 return NULL;
3947#endif
3948 }
3949 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07003950 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951 }
3952 }
3953 }
3954 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 *size = PyUnicode_WSTR_LENGTH(unicode);
3956 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003957}
3958
Alexander Belopolsky40018472011-02-26 01:02:56 +00003959Py_UNICODE *
3960PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963}
3964
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003965const Py_UNICODE *
3966_PyUnicode_AsUnicode(PyObject *unicode)
3967{
3968 Py_ssize_t size;
3969 const Py_UNICODE *wstr;
3970
3971 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3972 if (wstr && wcslen(wstr) != (size_t)size) {
3973 PyErr_SetString(PyExc_ValueError, "embedded null character");
3974 return NULL;
3975 }
3976 return wstr;
3977}
3978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979
Alexander Belopolsky40018472011-02-26 01:02:56 +00003980Py_ssize_t
3981PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982{
3983 if (!PyUnicode_Check(unicode)) {
3984 PyErr_BadArgument();
3985 goto onError;
3986 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003987 if (_PyUnicode_WSTR(unicode) == NULL) {
3988 if (PyUnicode_AsUnicode(unicode) == NULL)
3989 goto onError;
3990 }
3991 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 return -1;
3995}
3996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997Py_ssize_t
3998PyUnicode_GetLength(PyObject *unicode)
3999{
Victor Stinner07621332012-06-16 04:53:46 +02004000 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 PyErr_BadArgument();
4002 return -1;
4003 }
Victor Stinner07621332012-06-16 04:53:46 +02004004 if (PyUnicode_READY(unicode) == -1)
4005 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 return PyUnicode_GET_LENGTH(unicode);
4007}
4008
4009Py_UCS4
4010PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4011{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004012 void *data;
4013 int kind;
4014
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004015 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004016 PyErr_BadArgument();
4017 return (Py_UCS4)-1;
4018 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004019 if (PyUnicode_READY(unicode) == -1) {
4020 return (Py_UCS4)-1;
4021 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004022 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004023 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 return (Py_UCS4)-1;
4025 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004026 data = PyUnicode_DATA(unicode);
4027 kind = PyUnicode_KIND(unicode);
4028 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029}
4030
4031int
4032PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4033{
4034 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004035 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 return -1;
4037 }
Victor Stinner488fa492011-12-12 00:01:39 +01004038 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004039 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004040 PyErr_SetString(PyExc_IndexError, "string index out of range");
4041 return -1;
4042 }
Victor Stinner488fa492011-12-12 00:01:39 +01004043 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004044 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004045 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4046 PyErr_SetString(PyExc_ValueError, "character out of range");
4047 return -1;
4048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4050 index, ch);
4051 return 0;
4052}
4053
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4059
Victor Stinner554f3f02010-06-16 23:33:54 +00004060/* create or adjust a UnicodeDecodeError */
4061static void
4062make_decode_exception(PyObject **exceptionObject,
4063 const char *encoding,
4064 const char *input, Py_ssize_t length,
4065 Py_ssize_t startpos, Py_ssize_t endpos,
4066 const char *reason)
4067{
4068 if (*exceptionObject == NULL) {
4069 *exceptionObject = PyUnicodeDecodeError_Create(
4070 encoding, input, length, startpos, endpos, reason);
4071 }
4072 else {
4073 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4074 goto onError;
4075 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4076 goto onError;
4077 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4078 goto onError;
4079 }
4080 return;
4081
4082onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004083 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004084}
4085
#ifdef MS_WINDOWS
/* Decoding error callback helper for wchar_t-kind output strings.

   Looks up the error handler if necessary, creates or updates the
   UnicodeDecodeError, calls the handler and validates its (str, int)
   result.  On success the replacement text is copied into *output at
   *outpos (growing the output when needed), *outpos is advanced, and
   *input, *inend, *endinpos and *inptr are refreshed from the exception
   object and the position returned by the handler.

   Returns 0 on success, -1 with an exception set on error. */

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *handler_result = NULL;
    PyObject *replacement = NULL;
    PyObject *source = NULL;
    Py_ssize_t capacity;
    Py_ssize_t source_len;
    Py_ssize_t resume_at;
    Py_ssize_t needed;
    Py_ssize_t replacement_len;
    wchar_t *replacement_wstr;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    capacity = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL) {
            return -1;
        }
    }

    make_decode_exception(exceptionObject,
                          encoding,
                          *input, *inend - *input,
                          *startinpos, *endinpos,
                          reason);
    if (*exceptionObject == NULL) {
        return -1;
    }

    handler_result = PyObject_CallFunctionObjArgs(*errorHandler,
                                                  *exceptionObject, NULL);
    if (handler_result == NULL) {
        return -1;
    }
    if (!PyTuple_Check(handler_result)) {
        /* Skip the "Un;" format prefix to get the bare message. */
        PyErr_SetString(PyExc_TypeError, argparse + 3);
        goto fail;
    }
    if (!PyArg_ParseTuple(handler_result, argparse,
                          &replacement, &resume_at)) {
        goto fail;
    }

    /* The callback may have replaced the bytes object on the exception, so
       reload the input pointers from it. */
    source = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!source) {
        goto fail;
    }
    *input = PyBytes_AS_STRING(source);
    source_len = PyBytes_GET_SIZE(source);
    *inend = *input + source_len;
    /* Dropping our reference is safe: the exception still holds one, so the
       bytes object stays alive. */
    Py_DECREF(source);

    /* A negative position counts from the end of the input. */
    if (resume_at < 0) {
        resume_at += source_len;
    }
    if (resume_at < 0 || resume_at > source_len) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", resume_at);
        goto fail;
    }

    replacement_wstr = PyUnicode_AsUnicodeAndSize(replacement,
                                                  &replacement_len);
    if (replacement_wstr == NULL) {
        goto fail;
    }

    /* Reserve room for what has been written so far, the replacement, and
       the rest of the input from the new position, so that no further space
       checks are needed if the remainder decodes without errors. */
    needed = *outpos;
    if (needed > PY_SSIZE_T_MAX - replacement_len) {
        goto too_long;
    }
    needed += replacement_len;
    if (needed > PY_SSIZE_T_MAX - (source_len - resume_at)) {
        goto too_long;
    }
    needed += source_len - resume_at;

    if (needed > capacity) {
        /* Grow to at least double the old size when that is safe. */
        if (capacity <= PY_SSIZE_T_MAX/2 && needed < 2*capacity) {
            needed = 2*capacity;
        }
        if (unicode_resize(output, needed) < 0) {
            goto fail;
        }
    }

    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, replacement_wstr,
            replacement_len);
    *outpos += replacement_len;
    *endinpos = resume_at;
    *inptr = *input + resume_at;

    Py_DECREF(handler_result);
    return 0;

  too_long:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  fail:
    Py_XDECREF(handler_result);
    return -1;
}
#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198
4199static int
4200unicode_decode_call_errorhandler_writer(
4201 const char *errors, PyObject **errorHandler,
4202 const char *encoding, const char *reason,
4203 const char **input, const char **inend, Py_ssize_t *startinpos,
4204 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4205 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4206{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004207 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004208
4209 PyObject *restuple = NULL;
4210 PyObject *repunicode = NULL;
4211 Py_ssize_t insize;
4212 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004213 Py_ssize_t replen;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004214 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004215 PyObject *inputobj = NULL;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004216 int need_to_grow = 0;
4217 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004218
4219 if (*errorHandler == NULL) {
4220 *errorHandler = PyCodec_LookupError(errors);
4221 if (*errorHandler == NULL)
4222 goto onError;
4223 }
4224
4225 make_decode_exception(exceptionObject,
4226 encoding,
4227 *input, *inend - *input,
4228 *startinpos, *endinpos,
4229 reason);
4230 if (*exceptionObject == NULL)
4231 goto onError;
4232
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004233 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004234 if (restuple == NULL)
4235 goto onError;
4236 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004237 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004238 goto onError;
4239 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004240 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004241 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004242
4243 /* Copy back the bytes variables, which might have been modified by the
4244 callback */
4245 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4246 if (!inputobj)
4247 goto onError;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004248 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004249 *input = PyBytes_AS_STRING(inputobj);
4250 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004251 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004252 /* we can DECREF safely, as the exception has another reference,
4253 so the object won't go away. */
4254 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004258 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004259 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004261 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262
Victor Stinner170ca6f2013-04-18 00:25:28 +02004263 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004264 if (replen > 1) {
4265 writer->min_length += replen - 1;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004266 need_to_grow = 1;
4267 }
4268 new_inptr = *input + newpos;
4269 if (*inend - new_inptr > remain) {
4270 /* We don't know the decoding algorithm here so we make the worst
4271 assumption that one byte decodes to one unicode character.
4272 If unfortunately one byte could decode to more unicode characters,
4273 the decoder may write out-of-bound then. Is it possible for the
4274 algorithms using this function? */
4275 writer->min_length += *inend - new_inptr - remain;
4276 need_to_grow = 1;
4277 }
4278 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004279 writer->overallocate = 1;
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08004280 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004281 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4282 goto onError;
4283 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004284 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004285 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 *endinpos = newpos;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004288 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004291 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297}
4298
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   Note: each macro evaluates its argument more than once, so the argument
   must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (Any c outside the letter/digit/'+' cases yields 63, so callers must
   check IS_BASE64(c) first.) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4333
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code and is never written, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must be side-effect free.  The flag
 * arguments are parenthesized so that compound expressions such as
 * `a || b` bind as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379
Alexander Belopolsky40018472011-02-26 01:02:56 +00004380PyObject *
4381PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004382 Py_ssize_t size,
4383 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004385 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4386}
4387
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388/* The decoder. The only state we preserve is our read position,
4389 * i.e. how many characters we have consumed. So if we end in the
4390 * middle of a shift sequence we have to back off the read position
4391 * and the output to the beginning of the sequence, otherwise we lose
4392 * all the shift state (seen bits, number of bits seen, high
4393 * surrogate). */
4394
Alexander Belopolsky40018472011-02-26 01:02:56 +00004395PyObject *
4396PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004397 Py_ssize_t size,
4398 const char *errors,
4399 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004400{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004402 Py_ssize_t startinpos;
4403 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004405 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004406 const char *errmsg = "";
4407 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004408 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 unsigned int base64bits = 0;
4410 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004411 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 PyObject *errorHandler = NULL;
4413 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004415 if (size == 0) {
4416 if (consumed)
4417 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004418 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004419 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004421 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004422 _PyUnicodeWriter_Init(&writer);
4423 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424
4425 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426 e = s + size;
4427
4428 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004429 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004431 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 if (inShift) { /* in a base-64 section */
4434 if (IS_BASE64(ch)) { /* consume a base-64 character */
4435 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4436 base64bits += 6;
4437 s++;
4438 if (base64bits >= 16) {
4439 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004440 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004441 base64bits -= 16;
4442 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004443 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 if (surrogate) {
4445 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004446 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4447 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004448 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004449 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004451 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 }
4453 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004454 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004455 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457 }
4458 }
Victor Stinner551ac952011-11-29 22:58:13 +01004459 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 /* first surrogate */
4461 surrogate = outCh;
4462 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004464 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004465 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 }
4467 }
4468 }
4469 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 if (base64bits > 0) { /* left-over bits */
4472 if (base64bits >= 6) {
4473 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004474 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004475 errmsg = "partial character in shift sequence";
4476 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 else {
4479 /* Some bits remain; they should be zero */
4480 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004481 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004482 errmsg = "non-zero padding bits in shift sequence";
4483 goto utf7Error;
4484 }
4485 }
4486 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004487 if (surrogate && DECODE_DIRECT(ch)) {
4488 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4489 goto onError;
4490 }
4491 surrogate = 0;
4492 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 /* '-' is absorbed; other terminating
4494 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004495 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004496 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004497 }
4498 }
4499 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 s++; /* consume '+' */
4502 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004504 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004505 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004506 }
4507 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004509 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004510 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004512 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513 }
4514 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004517 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004518 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 else {
4521 startinpos = s-starts;
4522 s++;
4523 errmsg = "unexpected special character";
4524 goto utf7Error;
4525 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 errors, &errorHandler,
4531 "utf7", errmsg,
4532 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
4536
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 /* end of string */
4538
4539 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4540 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004541 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 if (surrogate ||
4543 (base64bits >= 6) ||
4544 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004546 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 errors, &errorHandler,
4548 "utf7", "unterminated shift sequence",
4549 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004550 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 goto onError;
4552 if (s < e)
4553 goto restart;
4554 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004556
4557 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004558 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004560 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004561 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004562 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004563 writer.kind, writer.data, shiftOutStart);
4564 Py_XDECREF(errorHandler);
4565 Py_XDECREF(exc);
4566 _PyUnicodeWriter_Dealloc(&writer);
4567 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004568 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004569 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 }
4571 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004572 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004574 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 Py_XDECREF(errorHandler);
4577 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004578 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579
Benjamin Peterson29060642009-01-31 22:14:21 +00004580 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 Py_XDECREF(errorHandler);
4582 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004583 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 return NULL;
4585}
4586
4587
Alexander Belopolsky40018472011-02-26 01:02:56 +00004588PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589_PyUnicode_EncodeUTF7(PyObject *str,
4590 int base64SetO,
4591 int base64WhiteSpace,
4592 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004593{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004594 int kind;
4595 void *data;
4596 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004597 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004599 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 unsigned int base64bits = 0;
4601 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602 char * out;
4603 char * start;
4604
Benjamin Petersonbac79492012-01-14 13:34:47 -05004605 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004606 return NULL;
4607 kind = PyUnicode_KIND(str);
4608 data = PyUnicode_DATA(str);
4609 len = PyUnicode_GET_LENGTH(str);
4610
4611 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004614 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004615 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004616 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004617 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 if (v == NULL)
4619 return NULL;
4620
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004621 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004622 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004623 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 if (inShift) {
4626 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4627 /* shifting out */
4628 if (base64bits) { /* output remaining bits */
4629 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4630 base64buffer = 0;
4631 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 }
4633 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 /* Characters not in the BASE64 set implicitly unshift the sequence
4635 so no '-' is required, except if the character is itself a '-' */
4636 if (IS_BASE64(ch) || ch == '-') {
4637 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 *out++ = (char) ch;
4640 }
4641 else {
4642 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004643 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 else { /* not in a shift sequence */
4646 if (ch == '+') {
4647 *out++ = '+';
4648 *out++ = '-';
4649 }
4650 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4651 *out++ = (char) ch;
4652 }
4653 else {
4654 *out++ = '+';
4655 inShift = 1;
4656 goto encode_char;
4657 }
4658 }
4659 continue;
4660encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004662 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004663
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 /* code first surrogate */
4665 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004666 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 while (base64bits >= 6) {
4668 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4669 base64bits -= 6;
4670 }
4671 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004672 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 base64bits += 16;
4675 base64buffer = (base64buffer << 16) | ch;
4676 while (base64bits >= 6) {
4677 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4678 base64bits -= 6;
4679 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004680 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 if (base64bits)
4682 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4683 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004684 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004685 if (_PyBytes_Resize(&v, out - start) < 0)
4686 return NULL;
4687 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004689PyObject *
4690PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4691 Py_ssize_t size,
4692 int base64SetO,
4693 int base64WhiteSpace,
4694 const char *errors)
4695{
4696 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004697 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004698 if (tmp == NULL)
4699 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004700 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004701 base64WhiteSpace, errors);
4702 Py_DECREF(tmp);
4703 return result;
4704}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706#undef IS_BASE64
4707#undef FROM_BASE64
4708#undef TO_BASE64
4709#undef DECODE_DIRECT
4710#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712/* --- UTF-8 Codec -------------------------------------------------------- */
4713
Alexander Belopolsky40018472011-02-26 01:02:56 +00004714PyObject *
4715PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004716 Py_ssize_t size,
4717 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718{
Walter Dörwald69652032004-09-07 20:24:22 +00004719 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4720}
4721
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722#include "stringlib/asciilib.h"
4723#include "stringlib/codecs.h"
4724#include "stringlib/undef.h"
4725
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004726#include "stringlib/ucs1lib.h"
4727#include "stringlib/codecs.h"
4728#include "stringlib/undef.h"
4729
4730#include "stringlib/ucs2lib.h"
4731#include "stringlib/codecs.h"
4732#include "stringlib/undef.h"
4733
4734#include "stringlib/ucs4lib.h"
4735#include "stringlib/codecs.h"
4736#include "stringlib/undef.h"
4737
Antoine Pitrouab868312009-01-10 15:40:25 +00004738/* Mask to quickly check whether a C 'long' contains a
4739 non-ASCII, UTF8-encoded char. */
4740#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004741# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004742#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004743# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004744#else
4745# error C 'long' size should be either 4 or 8!
4746#endif
4747
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748static Py_ssize_t
4749ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004750{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004752 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004753
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004754 /*
4755 * Issue #17237: m68k is a bit different from most architectures in
4756 * that objects do not use "natural alignment" - for example, int and
4757 * long are only aligned at 2-byte boundaries. Therefore the assert()
4758 * won't work; also, tests have shown that skipping the "optimised
4759 * version" will even speed up m68k.
4760 */
4761#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004762#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004763 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4764 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 /* Fast path, see in STRINGLIB(utf8_decode) for
4766 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004767 /* Help allocation */
4768 const char *_p = p;
4769 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 while (_p < aligned_end) {
4771 unsigned long value = *(const unsigned long *) _p;
4772 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774 *((unsigned long *)q) = value;
4775 _p += SIZEOF_LONG;
4776 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004777 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 p = _p;
4779 while (p < end) {
4780 if ((unsigned char)*p & 0x80)
4781 break;
4782 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004787#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 while (p < end) {
4789 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4790 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004791 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004792 /* Help allocation */
4793 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 while (_p < aligned_end) {
4795 unsigned long value = *(unsigned long *) _p;
4796 if (value & ASCII_CHAR_MASK)
4797 break;
4798 _p += SIZEOF_LONG;
4799 }
4800 p = _p;
4801 if (_p == end)
4802 break;
4803 }
4804 if ((unsigned char)*p & 0x80)
4805 break;
4806 ++p;
4807 }
4808 memcpy(dest, start, p - start);
4809 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810}
Antoine Pitrouab868312009-01-10 15:40:25 +00004811
Victor Stinner785938e2011-12-11 20:09:03 +01004812PyObject *
4813PyUnicode_DecodeUTF8Stateful(const char *s,
4814 Py_ssize_t size,
4815 const char *errors,
4816 Py_ssize_t *consumed)
4817{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004818 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004819 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004821
4822 Py_ssize_t startinpos;
4823 Py_ssize_t endinpos;
4824 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004825 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004826 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004827 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004828
4829 if (size == 0) {
4830 if (consumed)
4831 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004832 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004833 }
4834
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4836 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004837 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 *consumed = 1;
4839 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004840 }
4841
Victor Stinner8f674cc2013-04-17 23:02:17 +02004842 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004843 writer.min_length = size;
4844 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004845 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004846
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004847 writer.pos = ascii_decode(s, end, writer.data);
4848 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849 while (s < end) {
4850 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004851 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004852
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004854 if (PyUnicode_IS_ASCII(writer.buffer))
4855 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004856 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004857 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004858 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004859 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004860 } else {
4861 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004862 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863 }
4864
4865 switch (ch) {
4866 case 0:
4867 if (s == end || consumed)
4868 goto End;
4869 errmsg = "unexpected end of data";
4870 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004871 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004872 break;
4873 case 1:
4874 errmsg = "invalid start byte";
4875 startinpos = s - starts;
4876 endinpos = startinpos + 1;
4877 break;
4878 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004879 case 3:
4880 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004881 errmsg = "invalid continuation byte";
4882 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004883 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 break;
4885 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004886 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 goto onError;
4888 continue;
4889 }
4890
Victor Stinner1d65d912015-10-05 13:43:50 +02004891 if (error_handler == _Py_ERROR_UNKNOWN)
4892 error_handler = get_error_handler(errors);
4893
4894 switch (error_handler) {
4895 case _Py_ERROR_IGNORE:
4896 s += (endinpos - startinpos);
4897 break;
4898
4899 case _Py_ERROR_REPLACE:
4900 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4901 goto onError;
4902 s += (endinpos - startinpos);
4903 break;
4904
4905 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004906 {
4907 Py_ssize_t i;
4908
Victor Stinner1d65d912015-10-05 13:43:50 +02004909 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4910 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004911 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004912 ch = (Py_UCS4)(unsigned char)(starts[i]);
4913 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4914 ch + 0xdc00);
4915 writer.pos++;
4916 }
4917 s += (endinpos - startinpos);
4918 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004919 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004920
4921 default:
4922 if (unicode_decode_call_errorhandler_writer(
4923 errors, &error_handler_obj,
4924 "utf-8", errmsg,
4925 &starts, &end, &startinpos, &endinpos, &exc, &s,
4926 &writer))
4927 goto onError;
4928 }
Victor Stinner785938e2011-12-11 20:09:03 +01004929 }
4930
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004931End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 if (consumed)
4933 *consumed = s - starts;
4934
Victor Stinner1d65d912015-10-05 13:43:50 +02004935 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004937 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938
4939onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004940 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004942 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004944}
4945
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004946
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004947/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4948 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004949
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004950 On success, write a pointer to a newly allocated wide character string into
4951 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4952 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004953
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004954 On memory allocation failure, return -1.
4955
4956 On decoding error (if surrogateescape is zero), return -2. If wlen is
4957 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4958 is not NULL, write the decoding error message into *reason. */
4959int
4960_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4961 const char **reason, int surrogateescape)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004962{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004963 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004964 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 wchar_t *unicode;
4966 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004967
4968 /* Note: size will always be longer than the resulting Unicode
4969 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004970 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004971 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004972 }
4973
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004974 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004975 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004976 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004977 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004978
4979 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004980 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004981 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004982 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004983 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004985 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004986#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004988#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 if (ch > 0xFF) {
4990#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004991 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004992#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004993 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004994 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4996 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4997#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 else {
5000 if (!ch && s == e)
5001 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005002 if (!surrogateescape) {
5003 PyMem_RawFree(unicode );
5004 if (reason != NULL) {
5005 switch (ch) {
5006 case 0:
5007 *reason = "unexpected end of data";
5008 break;
5009 case 1:
5010 *reason = "invalid start byte";
5011 break;
5012 /* 2, 3, 4 */
5013 default:
5014 *reason = "invalid continuation byte";
5015 break;
5016 }
5017 }
5018 if (wlen != NULL) {
5019 *wlen = s - orig_s;
5020 }
5021 return -2;
5022 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005023 /* surrogateescape */
5024 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5025 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005026 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005027 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005028 if (wlen) {
5029 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005030 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005031 *wstr = unicode;
5032 return 0;
5033}
5034
5035wchar_t*
5036_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5037{
5038 wchar_t *wstr;
5039 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5040 if (res != 0) {
5041 return NULL;
5042 }
5043 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005044}
5045
Antoine Pitrouab868312009-01-10 15:40:25 +00005046
Victor Stinnere47e6982017-12-21 15:45:16 +01005047/* UTF-8 encoder using the surrogateescape error handler .
5048
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005049 On success, return 0 and write the newly allocated character string (use
5050 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005051
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005052 On encoding failure, return -2 and write the position of the invalid
5053 surrogate character into *error_pos (if error_pos is set) and the decoding
5054 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005055
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005056 On memory allocation failure, return -1. */
5057int
5058_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5059 const char **reason, int raw_malloc, int surrogateescape)
Victor Stinnere47e6982017-12-21 15:45:16 +01005060{
5061 const Py_ssize_t max_char_size = 4;
5062 Py_ssize_t len = wcslen(text);
5063
5064 assert(len >= 0);
5065
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005066 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5067 return -1;
5068 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005069 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005070 if (raw_malloc) {
5071 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005072 }
5073 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005074 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005075 }
5076 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005077 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005078 }
5079
5080 char *p = bytes;
5081 Py_ssize_t i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005082 for (i = 0; i < len; i++) {
5083 Py_UCS4 ch = text[i];
Victor Stinnere47e6982017-12-21 15:45:16 +01005084
5085 if (ch < 0x80) {
5086 /* Encode ASCII */
5087 *p++ = (char) ch;
5088
5089 }
5090 else if (ch < 0x0800) {
5091 /* Encode Latin-1 */
5092 *p++ = (char)(0xc0 | (ch >> 6));
5093 *p++ = (char)(0x80 | (ch & 0x3f));
5094 }
5095 else if (Py_UNICODE_IS_SURROGATE(ch)) {
5096 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005097 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005098 if (error_pos != NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005099 *error_pos = (size_t)i;
Victor Stinnere47e6982017-12-21 15:45:16 +01005100 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005101 if (reason != NULL) {
5102 *reason = "encoding error";
5103 }
5104 if (raw_malloc) {
5105 PyMem_RawFree(bytes);
5106 }
5107 else {
5108 PyMem_Free(bytes);
5109 }
5110 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005111 }
5112 *p++ = (char)(ch & 0xff);
5113 }
5114 else if (ch < 0x10000) {
5115 *p++ = (char)(0xe0 | (ch >> 12));
5116 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5117 *p++ = (char)(0x80 | (ch & 0x3f));
5118 }
5119 else { /* ch >= 0x10000 */
5120 assert(ch <= MAX_UNICODE);
5121 /* Encode UCS4 Unicode ordinals */
5122 *p++ = (char)(0xf0 | (ch >> 18));
5123 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5124 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5125 *p++ = (char)(0x80 | (ch & 0x3f));
5126 }
5127 }
5128 *p++ = '\0';
5129
5130 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005131 char *bytes2;
5132 if (raw_malloc) {
5133 bytes2 = PyMem_RawRealloc(bytes, final_size);
5134 }
5135 else {
5136 bytes2 = PyMem_Realloc(bytes, final_size);
5137 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005138 if (bytes2 == NULL) {
5139 if (error_pos != NULL) {
5140 *error_pos = (size_t)-1;
5141 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005142 if (raw_malloc) {
5143 PyMem_RawFree(bytes);
5144 }
5145 else {
5146 PyMem_Free(bytes);
5147 }
5148 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005149 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005150 *str = bytes2;
5151 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005152}
5153
5154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005155/* Primary internal function which creates utf8 encoded bytes objects.
5156
5157 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005158 and allocate exactly as much space needed at the end. Else allocate the
5159 maximum possible needed (4 result bytes per Unicode character), and return
5160 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005161*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005162PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005163_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164{
Victor Stinner6099a032011-12-18 14:22:26 +01005165 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005166 void *data;
5167 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005169 if (!PyUnicode_Check(unicode)) {
5170 PyErr_BadArgument();
5171 return NULL;
5172 }
5173
5174 if (PyUnicode_READY(unicode) == -1)
5175 return NULL;
5176
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005177 if (PyUnicode_UTF8(unicode))
5178 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5179 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005180
5181 kind = PyUnicode_KIND(unicode);
5182 data = PyUnicode_DATA(unicode);
5183 size = PyUnicode_GET_LENGTH(unicode);
5184
Benjamin Petersonead6b532011-12-20 17:23:42 -06005185 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005186 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005187 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005188 case PyUnicode_1BYTE_KIND:
5189 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5190 assert(!PyUnicode_IS_ASCII(unicode));
5191 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5192 case PyUnicode_2BYTE_KIND:
5193 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5194 case PyUnicode_4BYTE_KIND:
5195 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197}
5198
Alexander Belopolsky40018472011-02-26 01:02:56 +00005199PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005200PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5201 Py_ssize_t size,
5202 const char *errors)
5203{
5204 PyObject *v, *unicode;
5205
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005206 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005207 if (unicode == NULL)
5208 return NULL;
5209 v = _PyUnicode_AsUTF8String(unicode, errors);
5210 Py_DECREF(unicode);
5211 return v;
5212}
5213
5214PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005215PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005217 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218}
5219
Walter Dörwald41980ca2007-08-16 21:55:45 +00005220/* --- UTF-32 Codec ------------------------------------------------------- */
5221
5222PyObject *
5223PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 Py_ssize_t size,
5225 const char *errors,
5226 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005227{
5228 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5229}
5230
5231PyObject *
5232PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 Py_ssize_t size,
5234 const char *errors,
5235 int *byteorder,
5236 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005237{
5238 const char *starts = s;
5239 Py_ssize_t startinpos;
5240 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005241 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005242 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005243 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005244 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246 PyObject *errorHandler = NULL;
5247 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005248
Walter Dörwald41980ca2007-08-16 21:55:45 +00005249 q = (unsigned char *)s;
5250 e = q + size;
5251
5252 if (byteorder)
5253 bo = *byteorder;
5254
5255 /* Check for BOM marks (U+FEFF) in the input and adjust current
5256 byte order setting accordingly. In native mode, the leading BOM
5257 mark is skipped, in all other modes, it is copied to the output
5258 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005259 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005260 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005261 if (bom == 0x0000FEFF) {
5262 bo = -1;
5263 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005265 else if (bom == 0xFFFE0000) {
5266 bo = 1;
5267 q += 4;
5268 }
5269 if (byteorder)
5270 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005271 }
5272
Victor Stinnere64322e2012-10-30 23:12:47 +01005273 if (q == e) {
5274 if (consumed)
5275 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005276 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005277 }
5278
Victor Stinnere64322e2012-10-30 23:12:47 +01005279#ifdef WORDS_BIGENDIAN
5280 le = bo < 0;
5281#else
5282 le = bo <= 0;
5283#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005284 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005285
Victor Stinner8f674cc2013-04-17 23:02:17 +02005286 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005287 writer.min_length = (e - q + 3) / 4;
5288 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005289 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005290
Victor Stinnere64322e2012-10-30 23:12:47 +01005291 while (1) {
5292 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005293 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005294
Victor Stinnere64322e2012-10-30 23:12:47 +01005295 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005296 enum PyUnicode_Kind kind = writer.kind;
5297 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005300 if (le) {
5301 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005302 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005303 if (ch > maxch)
5304 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005305 if (kind != PyUnicode_1BYTE_KIND &&
5306 Py_UNICODE_IS_SURROGATE(ch))
5307 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005309 q += 4;
5310 } while (q <= last);
5311 }
5312 else {
5313 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005314 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005315 if (ch > maxch)
5316 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005317 if (kind != PyUnicode_1BYTE_KIND &&
5318 Py_UNICODE_IS_SURROGATE(ch))
5319 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005320 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 q += 4;
5322 } while (q <= last);
5323 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005324 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005325 }
5326
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005327 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005328 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005329 startinpos = ((const char *)q) - starts;
5330 endinpos = startinpos + 4;
5331 }
5332 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005333 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005337 startinpos = ((const char *)q) - starts;
5338 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005340 else {
5341 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005342 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 goto onError;
5344 q += 4;
5345 continue;
5346 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005347 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005348 startinpos = ((const char *)q) - starts;
5349 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005351
5352 /* The remaining input chars are ignored if the callback
5353 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005356 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360 }
5361
Walter Dörwald41980ca2007-08-16 21:55:45 +00005362 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364
Walter Dörwald41980ca2007-08-16 21:55:45 +00005365 Py_XDECREF(errorHandler);
5366 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005367 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005370 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371 Py_XDECREF(errorHandler);
5372 Py_XDECREF(exc);
5373 return NULL;
5374}
5375
5376PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005377_PyUnicode_EncodeUTF32(PyObject *str,
5378 const char *errors,
5379 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005380{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005381 enum PyUnicode_Kind kind;
5382 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005384 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005385 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005386#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005387 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005388#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005389 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005390#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005391 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005392 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005393 PyObject *errorHandler = NULL;
5394 PyObject *exc = NULL;
5395 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005397 if (!PyUnicode_Check(str)) {
5398 PyErr_BadArgument();
5399 return NULL;
5400 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005401 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005402 return NULL;
5403 kind = PyUnicode_KIND(str);
5404 data = PyUnicode_DATA(str);
5405 len = PyUnicode_GET_LENGTH(str);
5406
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005408 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005410 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005411 if (v == NULL)
5412 return NULL;
5413
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005414 /* output buffer is 4-bytes aligned */
5415 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005416 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005419 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005425 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005426 else
5427 encoding = "utf-32";
5428
5429 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5431 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005432 }
5433
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005434 pos = 0;
5435 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005436 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005437
5438 if (kind == PyUnicode_2BYTE_KIND) {
5439 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5440 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005441 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005442 else {
5443 assert(kind == PyUnicode_4BYTE_KIND);
5444 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5445 &out, native_ordering);
5446 }
5447 if (pos == len)
5448 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005449
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 rep = unicode_encode_call_errorhandler(
5451 errors, &errorHandler,
5452 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005453 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005454 if (!rep)
5455 goto error;
5456
5457 if (PyBytes_Check(rep)) {
5458 repsize = PyBytes_GET_SIZE(rep);
5459 if (repsize & 3) {
5460 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 "surrogates not allowed");
5463 goto error;
5464 }
5465 moreunits = repsize / 4;
5466 }
5467 else {
5468 assert(PyUnicode_Check(rep));
5469 if (PyUnicode_READY(rep) < 0)
5470 goto error;
5471 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5472 if (!PyUnicode_IS_ASCII(rep)) {
5473 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005474 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005475 "surrogates not allowed");
5476 goto error;
5477 }
5478 }
5479
5480 /* four bytes are reserved for each surrogate */
5481 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005482 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005483 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005484 /* integer overflow */
5485 PyErr_NoMemory();
5486 goto error;
5487 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005488 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005490 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005491 }
5492
5493 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005494 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005495 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005498 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5499 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005500 }
5501
5502 Py_CLEAR(rep);
5503 }
5504
5505 /* Cut back to size actually needed. This is necessary for, for example,
5506 encoding of a string containing isolated surrogates and the 'ignore'
5507 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005508 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005509 if (nsize != PyBytes_GET_SIZE(v))
5510 _PyBytes_Resize(&v, nsize);
5511 Py_XDECREF(errorHandler);
5512 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005513 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005514 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005515 error:
5516 Py_XDECREF(rep);
5517 Py_XDECREF(errorHandler);
5518 Py_XDECREF(exc);
5519 Py_XDECREF(v);
5520 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005521}
5522
Alexander Belopolsky40018472011-02-26 01:02:56 +00005523PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005524PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5525 Py_ssize_t size,
5526 const char *errors,
5527 int byteorder)
5528{
5529 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005530 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005531 if (tmp == NULL)
5532 return NULL;
5533 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5534 Py_DECREF(tmp);
5535 return result;
5536}
5537
5538PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005539PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005540{
Victor Stinnerb960b342011-11-20 19:12:52 +01005541 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005542}
5543
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544/* --- UTF-16 Codec ------------------------------------------------------- */
5545
Tim Peters772747b2001-08-09 22:21:55 +00005546PyObject *
5547PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 Py_ssize_t size,
5549 const char *errors,
5550 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551{
Walter Dörwald69652032004-09-07 20:24:22 +00005552 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5553}
5554
5555PyObject *
5556PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 Py_ssize_t size,
5558 const char *errors,
5559 int *byteorder,
5560 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005562 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005563 Py_ssize_t startinpos;
5564 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005565 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005566 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005567 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005568 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005569 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 PyObject *errorHandler = NULL;
5571 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005572 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Tim Peters772747b2001-08-09 22:21:55 +00005574 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005575 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
5577 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005578 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005580 /* Check for BOM marks (U+FEFF) in the input and adjust current
5581 byte order setting accordingly. In native mode, the leading BOM
5582 mark is skipped, in all other modes, it is copied to the output
5583 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005584 if (bo == 0 && size >= 2) {
5585 const Py_UCS4 bom = (q[1] << 8) | q[0];
5586 if (bom == 0xFEFF) {
5587 q += 2;
5588 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 else if (bom == 0xFFFE) {
5591 q += 2;
5592 bo = 1;
5593 }
5594 if (byteorder)
5595 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597
Antoine Pitrou63065d72012-05-15 23:48:04 +02005598 if (q == e) {
5599 if (consumed)
5600 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005601 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005602 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005603
Christian Heimes743e0cd2012-10-17 23:52:17 +02005604#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005605 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005606 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005607#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005609 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005610#endif
Tim Peters772747b2001-08-09 22:21:55 +00005611
Antoine Pitrou63065d72012-05-15 23:48:04 +02005612 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang86fdad02018-01-31 20:48:05 +08005613 character count normally. Error handler will take care of
5614 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005615 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005616 writer.min_length = (e - q + 1) / 2;
5617 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005619
Antoine Pitrou63065d72012-05-15 23:48:04 +02005620 while (1) {
5621 Py_UCS4 ch = 0;
5622 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005625 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 native_ordering);
5629 else
5630 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 native_ordering);
5633 } else if (kind == PyUnicode_2BYTE_KIND) {
5634 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005636 native_ordering);
5637 } else {
5638 assert(kind == PyUnicode_4BYTE_KIND);
5639 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005640 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005641 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005642 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005643 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005644
Antoine Pitrou63065d72012-05-15 23:48:04 +02005645 switch (ch)
5646 {
5647 case 0:
5648 /* remaining byte at the end? (size should be even) */
5649 if (q == e || consumed)
5650 goto End;
5651 errmsg = "truncated data";
5652 startinpos = ((const char *)q) - starts;
5653 endinpos = ((const char *)e) - starts;
5654 break;
5655 /* The remaining input chars are ignored if the callback
5656 chooses to skip the input */
5657 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005658 q -= 2;
5659 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005660 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005661 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005662 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005663 endinpos = ((const char *)e) - starts;
5664 break;
5665 case 2:
5666 errmsg = "illegal encoding";
5667 startinpos = ((const char *)q) - 2 - starts;
5668 endinpos = startinpos + 2;
5669 break;
5670 case 3:
5671 errmsg = "illegal UTF-16 surrogate";
5672 startinpos = ((const char *)q) - 4 - starts;
5673 endinpos = startinpos + 2;
5674 break;
5675 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005676 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005677 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 continue;
5679 }
5680
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005681 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005682 errors,
5683 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005684 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005685 &starts,
5686 (const char **)&e,
5687 &startinpos,
5688 &endinpos,
5689 &exc,
5690 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 }
5694
Antoine Pitrou63065d72012-05-15 23:48:04 +02005695End:
Walter Dörwald69652032004-09-07 20:24:22 +00005696 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 Py_XDECREF(errorHandler);
5700 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005701 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005704 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 Py_XDECREF(errorHandler);
5706 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 return NULL;
5708}
5709
Tim Peters772747b2001-08-09 22:21:55 +00005710PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005711_PyUnicode_EncodeUTF16(PyObject *str,
5712 const char *errors,
5713 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005715 enum PyUnicode_Kind kind;
5716 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005717 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005718 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005719 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005720 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005721#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005722 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005723#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005724 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005725#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 const char *encoding;
5727 Py_ssize_t nsize, pos;
5728 PyObject *errorHandler = NULL;
5729 PyObject *exc = NULL;
5730 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005731
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005732 if (!PyUnicode_Check(str)) {
5733 PyErr_BadArgument();
5734 return NULL;
5735 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005736 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005737 return NULL;
5738 kind = PyUnicode_KIND(str);
5739 data = PyUnicode_DATA(str);
5740 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005741
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005742 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005743 if (kind == PyUnicode_4BYTE_KIND) {
5744 const Py_UCS4 *in = (const Py_UCS4 *)data;
5745 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005746 while (in < end) {
5747 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005749 }
5750 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005751 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005754 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005755 nsize = len + pairs + (byteorder == 0);
5756 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005762 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005763 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005765 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005766 }
5767 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005768 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005769 }
Tim Peters772747b2001-08-09 22:21:55 +00005770
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005771 if (kind == PyUnicode_1BYTE_KIND) {
5772 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5773 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005774 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005775
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005776 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005777 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005778 }
5779 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005780 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005781 }
5782 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005784 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005785
5786 pos = 0;
5787 while (pos < len) {
5788 Py_ssize_t repsize, moreunits;
5789
5790 if (kind == PyUnicode_2BYTE_KIND) {
5791 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5792 &out, native_ordering);
5793 }
5794 else {
5795 assert(kind == PyUnicode_4BYTE_KIND);
5796 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5797 &out, native_ordering);
5798 }
5799 if (pos == len)
5800 break;
5801
5802 rep = unicode_encode_call_errorhandler(
5803 errors, &errorHandler,
5804 encoding, "surrogates not allowed",
5805 str, &exc, pos, pos + 1, &pos);
5806 if (!rep)
5807 goto error;
5808
5809 if (PyBytes_Check(rep)) {
5810 repsize = PyBytes_GET_SIZE(rep);
5811 if (repsize & 1) {
5812 raise_encode_exception(&exc, encoding,
5813 str, pos - 1, pos,
5814 "surrogates not allowed");
5815 goto error;
5816 }
5817 moreunits = repsize / 2;
5818 }
5819 else {
5820 assert(PyUnicode_Check(rep));
5821 if (PyUnicode_READY(rep) < 0)
5822 goto error;
5823 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5824 if (!PyUnicode_IS_ASCII(rep)) {
5825 raise_encode_exception(&exc, encoding,
5826 str, pos - 1, pos,
5827 "surrogates not allowed");
5828 goto error;
5829 }
5830 }
5831
5832 /* two bytes are reserved for each surrogate */
5833 if (moreunits > 1) {
5834 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005835 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 /* integer overflow */
5837 PyErr_NoMemory();
5838 goto error;
5839 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005840 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005841 goto error;
5842 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5843 }
5844
5845 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005846 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005847 out += moreunits;
5848 } else /* rep is unicode */ {
5849 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5850 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5851 &out, native_ordering);
5852 }
5853
5854 Py_CLEAR(rep);
5855 }
5856
5857 /* Cut back to size actually needed. This is necessary for, for example,
5858 encoding of a string containing isolated surrogates and the 'ignore' handler
5859 is used. */
5860 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5861 if (nsize != PyBytes_GET_SIZE(v))
5862 _PyBytes_Resize(&v, nsize);
5863 Py_XDECREF(errorHandler);
5864 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005865 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005866 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005867 error:
5868 Py_XDECREF(rep);
5869 Py_XDECREF(errorHandler);
5870 Py_XDECREF(exc);
5871 Py_XDECREF(v);
5872 return NULL;
5873#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874}
5875
Alexander Belopolsky40018472011-02-26 01:02:56 +00005876PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005877PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5878 Py_ssize_t size,
5879 const char *errors,
5880 int byteorder)
5881{
5882 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005883 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005884 if (tmp == NULL)
5885 return NULL;
5886 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5887 Py_DECREF(tmp);
5888 return result;
5889}
5890
5891PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005892PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895}
5896
5897/* --- Unicode Escape Codec ----------------------------------------------- */
5898
Fredrik Lundh06d12682001-01-24 07:59:11 +00005899static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005900
Alexander Belopolsky40018472011-02-26 01:02:56 +00005901PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005902_PyUnicode_DecodeUnicodeEscape(const char *s,
5903 Py_ssize_t size,
5904 const char *errors,
5905 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005908 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 PyObject *errorHandler = NULL;
5911 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005912
Eric V. Smith42454af2016-10-31 09:22:08 -04005913 // so we can remember if we've seen an invalid escape char or not
5914 *first_invalid_escape = NULL;
5915
Victor Stinner62ec3312016-09-06 17:04:34 -07005916 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005917 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005918 }
5919 /* Escaped strings will always be longer than the resulting
5920 Unicode string, so we start with size here and then reduce the
5921 length after conversion to the true value.
5922 (but if the error callback returns a long replacement string
5923 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005924 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005925 writer.min_length = size;
5926 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5927 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005928 }
5929
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 end = s + size;
5931 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005932 unsigned char c = (unsigned char) *s++;
5933 Py_UCS4 ch;
5934 int count;
5935 Py_ssize_t startinpos;
5936 Py_ssize_t endinpos;
5937 const char *message;
5938
5939#define WRITE_ASCII_CHAR(ch) \
5940 do { \
5941 assert(ch <= 127); \
5942 assert(writer.pos < writer.size); \
5943 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5944 } while(0)
5945
5946#define WRITE_CHAR(ch) \
5947 do { \
5948 if (ch <= writer.maxchar) { \
5949 assert(writer.pos < writer.size); \
5950 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5951 } \
5952 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5953 goto onError; \
5954 } \
5955 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956
5957 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005958 if (c != '\\') {
5959 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 continue;
5961 }
5962
Victor Stinner62ec3312016-09-06 17:04:34 -07005963 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005965 if (s >= end) {
5966 message = "\\ at end of string";
5967 goto error;
5968 }
5969 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005970
Victor Stinner62ec3312016-09-06 17:04:34 -07005971 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005972 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005975 case '\n': continue;
5976 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5977 case '\'': WRITE_ASCII_CHAR('\''); continue;
5978 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5979 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005980 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5982 case 't': WRITE_ASCII_CHAR('\t'); continue;
5983 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5984 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005986 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005987 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005988 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 case '0': case '1': case '2': case '3':
5992 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005994 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005995 ch = (ch<<3) + *s++ - '0';
5996 if (s < end && '0' <= *s && *s <= '7') {
5997 ch = (ch<<3) + *s++ - '0';
5998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006000 WRITE_CHAR(ch);
6001 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* hex escapes */
6004 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 message = "truncated \\xXX escape";
6008 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006012 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006013 message = "truncated \\uXXXX escape";
6014 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006017 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006018 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006019 message = "truncated \\UXXXXXXXX escape";
6020 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006021 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006022 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006023 ch <<= 4;
6024 if (c >= '0' && c <= '9') {
6025 ch += c - '0';
6026 }
6027 else if (c >= 'a' && c <= 'f') {
6028 ch += c - ('a' - 10);
6029 }
6030 else if (c >= 'A' && c <= 'F') {
6031 ch += c - ('A' - 10);
6032 }
6033 else {
6034 break;
6035 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006036 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006037 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006038 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006039 }
6040
6041 /* when we get here, ch is a 32-bit unicode character */
6042 if (ch > MAX_UNICODE) {
6043 message = "illegal Unicode character";
6044 goto error;
6045 }
6046
6047 WRITE_CHAR(ch);
6048 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006051 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006052 if (ucnhash_CAPI == NULL) {
6053 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006054 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6055 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006056 if (ucnhash_CAPI == NULL) {
6057 PyErr_SetString(
6058 PyExc_UnicodeError,
6059 "\\N escapes not supported (can't load unicodedata module)"
6060 );
6061 goto onError;
6062 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006064
6065 message = "malformed \\N character escape";
Miss Islington (bot)9fbcb142018-11-13 16:39:36 -08006066 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006067 const char *start = ++s;
6068 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006070 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006072 namelen = s - start;
6073 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006074 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006075 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006076 ch = 0xffffffff; /* in case 'getcode' messes up */
6077 if (namelen <= INT_MAX &&
6078 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6079 &ch, 0)) {
6080 assert(ch <= MAX_UNICODE);
6081 WRITE_CHAR(ch);
6082 continue;
6083 }
6084 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006085 }
6086 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006087 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006088
6089 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006090 if (*first_invalid_escape == NULL) {
6091 *first_invalid_escape = s-1; /* Back up one char, since we've
6092 already incremented s. */
6093 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006094 WRITE_ASCII_CHAR('\\');
6095 WRITE_CHAR(c);
6096 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006098
6099 error:
6100 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006101 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006102 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006103 errors, &errorHandler,
6104 "unicodeescape", message,
6105 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006106 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006107 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006108 }
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08006109 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006110
6111#undef WRITE_ASCII_CHAR
6112#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006114
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006115 Py_XDECREF(errorHandler);
6116 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006117 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006118
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006120 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 Py_XDECREF(errorHandler);
6122 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 return NULL;
6124}
6125
Eric V. Smith42454af2016-10-31 09:22:08 -04006126PyObject *
6127PyUnicode_DecodeUnicodeEscape(const char *s,
6128 Py_ssize_t size,
6129 const char *errors)
6130{
6131 const char *first_invalid_escape;
6132 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6133 &first_invalid_escape);
6134 if (result == NULL)
6135 return NULL;
6136 if (first_invalid_escape != NULL) {
6137 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6138 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006139 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006140 Py_DECREF(result);
6141 return NULL;
6142 }
6143 }
6144 return result;
6145}
6146
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006147/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Alexander Belopolsky40018472011-02-26 01:02:56 +00006149PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006155 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
Ezio Melottie7f90372012-10-05 03:33:31 +03006159 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006160 escape.
6161
Ezio Melottie7f90372012-10-05 03:33:31 +03006162 For UCS1 strings it's '\xxx', 4 bytes per source character.
6163 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6164 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006165 */
6166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 if (!PyUnicode_Check(unicode)) {
6168 PyErr_BadArgument();
6169 return NULL;
6170 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006171 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 }
Victor Stinner358af132015-10-12 22:36:57 +02006174
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 if (len == 0) {
6177 return PyBytes_FromStringAndSize(NULL, 0);
6178 }
6179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 kind = PyUnicode_KIND(unicode);
6181 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6183 bytes, and 1 byte characters 4. */
6184 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006185 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006186 return PyErr_NoMemory();
6187 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006188 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 if (repr == NULL) {
6190 return NULL;
6191 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006195 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006196
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 /* U+0000-U+00ff range */
6198 if (ch < 0x100) {
6199 if (ch >= ' ' && ch < 127) {
6200 if (ch != '\\') {
6201 /* Copy printable US ASCII as-is */
6202 *p++ = (char) ch;
6203 }
6204 /* Escape backslashes */
6205 else {
6206 *p++ = '\\';
6207 *p++ = '\\';
6208 }
6209 }
Victor Stinner358af132015-10-12 22:36:57 +02006210
Victor Stinner62ec3312016-09-06 17:04:34 -07006211 /* Map special whitespace to '\t', \n', '\r' */
6212 else if (ch == '\t') {
6213 *p++ = '\\';
6214 *p++ = 't';
6215 }
6216 else if (ch == '\n') {
6217 *p++ = '\\';
6218 *p++ = 'n';
6219 }
6220 else if (ch == '\r') {
6221 *p++ = '\\';
6222 *p++ = 'r';
6223 }
6224
6225 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6226 else {
6227 *p++ = '\\';
6228 *p++ = 'x';
6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230 *p++ = Py_hexdigits[ch & 0x000F];
6231 }
Tim Petersced69f82003-09-16 20:30:58 +00006232 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006233 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 *p++ = '\\';
6236 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006237 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6238 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6239 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6240 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6243 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006244
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 /* Make sure that the first two digits are zero */
6246 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006247 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 *p++ = 'U';
6249 *p++ = '0';
6250 *p++ = '0';
6251 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6256 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 assert(p - PyBytes_AS_STRING(repr) > 0);
6261 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6262 return NULL;
6263 }
6264 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265}
6266
Alexander Belopolsky40018472011-02-26 01:02:56 +00006267PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006271 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006272 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006273 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006275 }
6276
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 result = PyUnicode_AsUnicodeEscapeString(tmp);
6278 Py_DECREF(tmp);
6279 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280}
6281
6282/* --- Raw Unicode Escape Codec ------------------------------------------- */
6283
Alexander Belopolsky40018472011-02-26 01:02:56 +00006284PyObject *
6285PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006286 Py_ssize_t size,
6287 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006290 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 PyObject *errorHandler = NULL;
6293 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006294
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006296 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006298
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 /* Escaped strings will always be longer than the resulting
6300 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 length after conversion to the true value. (But decoding error
6302 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006303 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 writer.min_length = size;
6305 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6306 goto onError;
6307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006308
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 end = s + size;
6310 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 unsigned char c = (unsigned char) *s++;
6312 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006313 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 Py_ssize_t startinpos;
6315 Py_ssize_t endinpos;
6316 const char *message;
6317
6318#define WRITE_CHAR(ch) \
6319 do { \
6320 if (ch <= writer.maxchar) { \
6321 assert(writer.pos < writer.size); \
6322 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6323 } \
6324 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6325 goto onError; \
6326 } \
6327 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 if (c != '\\' || s >= end) {
6331 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006334
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 c = (unsigned char) *s++;
6336 if (c == 'u') {
6337 count = 4;
6338 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 else if (c == 'U') {
6341 count = 8;
6342 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 }
6344 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 assert(writer.pos < writer.size);
6346 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6347 WRITE_CHAR(c);
6348 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006349 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 startinpos = s - starts - 2;
6351
6352 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6353 for (ch = 0; count && s < end; ++s, --count) {
6354 c = (unsigned char)*s;
6355 ch <<= 4;
6356 if (c >= '0' && c <= '9') {
6357 ch += c - '0';
6358 }
6359 else if (c >= 'a' && c <= 'f') {
6360 ch += c - ('a' - 10);
6361 }
6362 else if (c >= 'A' && c <= 'F') {
6363 ch += c - ('A' - 10);
6364 }
6365 else {
6366 break;
6367 }
6368 }
6369 if (!count) {
6370 if (ch <= MAX_UNICODE) {
6371 WRITE_CHAR(ch);
6372 continue;
6373 }
6374 message = "\\Uxxxxxxxx out of range";
6375 }
6376
6377 endinpos = s-starts;
6378 writer.min_length = end - s + writer.pos;
6379 if (unicode_decode_call_errorhandler_writer(
6380 errors, &errorHandler,
6381 "rawunicodeescape", message,
6382 &starts, &end, &startinpos, &endinpos, &exc, &s,
6383 &writer)) {
6384 goto onError;
6385 }
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08006386 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006387
6388#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 Py_XDECREF(errorHandler);
6391 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006392 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006393
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006395 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 Py_XDECREF(errorHandler);
6397 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006399
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400}
6401
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006402
Alexander Belopolsky40018472011-02-26 01:02:56 +00006403PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006404PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405{
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006409 int kind;
6410 void *data;
6411 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006413 if (!PyUnicode_Check(unicode)) {
6414 PyErr_BadArgument();
6415 return NULL;
6416 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006420 kind = PyUnicode_KIND(unicode);
6421 data = PyUnicode_DATA(unicode);
6422 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (kind == PyUnicode_1BYTE_KIND) {
6424 return PyBytes_FromStringAndSize(data, len);
6425 }
Victor Stinner0e368262011-11-10 20:12:49 +01006426
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6428 bytes, and 1 byte characters 4. */
6429 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006430
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 if (len > PY_SSIZE_T_MAX / expandsize) {
6432 return PyErr_NoMemory();
6433 }
6434 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6435 if (repr == NULL) {
6436 return NULL;
6437 }
6438 if (len == 0) {
6439 return repr;
6440 }
6441
6442 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006443 for (pos = 0; pos < len; pos++) {
6444 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006445
Victor Stinner62ec3312016-09-06 17:04:34 -07006446 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6447 if (ch < 0x100) {
6448 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006449 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006450 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6451 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 *p++ = '\\';
6453 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006454 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6455 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6456 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6457 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006459 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6460 else {
6461 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6462 *p++ = '\\';
6463 *p++ = 'U';
6464 *p++ = '0';
6465 *p++ = '0';
6466 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6467 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6468 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6469 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6471 *p++ = Py_hexdigits[ch & 15];
6472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006474
Victor Stinner62ec3312016-09-06 17:04:34 -07006475 assert(p > PyBytes_AS_STRING(repr));
6476 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6477 return NULL;
6478 }
6479 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480}
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006483PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6484 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006486 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006487 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006488 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006489 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006490 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6491 Py_DECREF(tmp);
6492 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493}
6494
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006495/* --- Unicode Internal Codec ------------------------------------------- */
6496
Alexander Belopolsky40018472011-02-26 01:02:56 +00006497PyObject *
6498_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006499 Py_ssize_t size,
6500 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006501{
6502 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006503 Py_ssize_t startinpos;
6504 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006505 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006506 const char *end;
6507 const char *reason;
6508 PyObject *errorHandler = NULL;
6509 PyObject *exc = NULL;
6510
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006511 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006512 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006513 1))
6514 return NULL;
6515
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006516 if (size < 0) {
6517 PyErr_BadInternalCall();
6518 return NULL;
6519 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006520 if (size == 0)
6521 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006522
Victor Stinner8f674cc2013-04-17 23:02:17 +02006523 _PyUnicodeWriter_Init(&writer);
6524 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6525 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006527 }
6528 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006529
Victor Stinner8f674cc2013-04-17 23:02:17 +02006530 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006531 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006532 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006533 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006534 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006535 endinpos = end-starts;
6536 reason = "truncated input";
6537 goto error;
6538 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006539 /* We copy the raw representation one byte at a time because the
6540 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006541 ((char *) &uch)[0] = s[0];
6542 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006543#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006544 ((char *) &uch)[2] = s[2];
6545 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006546#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006548#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006549 /* We have to sanity check the raw data, otherwise doom looms for
6550 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006551 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006552 endinpos = s - starts + Py_UNICODE_SIZE;
6553 reason = "illegal code point (> 0x10FFFF)";
6554 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006555 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006556#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006557 s += Py_UNICODE_SIZE;
6558#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006559 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006560 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006561 Py_UNICODE uch2;
6562 ((char *) &uch2)[0] = s[0];
6563 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006564 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006565 {
Victor Stinner551ac952011-11-29 22:58:13 +01006566 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006567 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006568 }
6569 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006570#endif
6571
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006572 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006573 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006574 continue;
6575
6576 error:
6577 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006578 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006579 errors, &errorHandler,
6580 "unicode_internal", reason,
6581 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006582 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006583 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006584 }
6585
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006586 Py_XDECREF(errorHandler);
6587 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006588 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006589
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006591 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592 Py_XDECREF(errorHandler);
6593 Py_XDECREF(exc);
6594 return NULL;
6595}
6596
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597/* --- Latin-1 Codec ------------------------------------------------------ */
6598
Alexander Belopolsky40018472011-02-26 01:02:56 +00006599PyObject *
6600PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006601 Py_ssize_t size,
6602 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006605 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606}
6607
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006609static void
6610make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006611 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006612 PyObject *unicode,
6613 Py_ssize_t startpos, Py_ssize_t endpos,
6614 const char *reason)
6615{
6616 if (*exceptionObject == NULL) {
6617 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006618 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006619 encoding, unicode, startpos, endpos, reason);
6620 }
6621 else {
6622 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6623 goto onError;
6624 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6625 goto onError;
6626 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6627 goto onError;
6628 return;
6629 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006630 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006631 }
6632}
6633
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006634/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006635static void
6636raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006637 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006638 PyObject *unicode,
6639 Py_ssize_t startpos, Py_ssize_t endpos,
6640 const char *reason)
6641{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006642 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006643 encoding, unicode, startpos, endpos, reason);
6644 if (*exceptionObject != NULL)
6645 PyCodec_StrictErrors(*exceptionObject);
6646}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647
6648/* error handling callback helper:
6649 build arguments, call the callback and check the arguments,
6650 put the result into newpos and return the replacement string, which
6651 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006652static PyObject *
6653unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006654 PyObject **errorHandler,
6655 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006656 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006657 Py_ssize_t startpos, Py_ssize_t endpos,
6658 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006659{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006660 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662 PyObject *restuple;
6663 PyObject *resunicode;
6664
6665 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 }
6670
Benjamin Petersonbac79492012-01-14 13:34:47 -05006671 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006672 return NULL;
6673 len = PyUnicode_GET_LENGTH(unicode);
6674
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006675 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006676 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006680 restuple = PyObject_CallFunctionObjArgs(
6681 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006685 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 Py_DECREF(restuple);
6687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006689 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 &resunicode, newpos)) {
6691 Py_DECREF(restuple);
6692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006694 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6695 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6696 Py_DECREF(restuple);
6697 return NULL;
6698 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006700 *newpos = len + *newpos;
6701 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006702 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 Py_DECREF(restuple);
6704 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 Py_INCREF(resunicode);
6707 Py_DECREF(restuple);
6708 return resunicode;
6709}
6710
Alexander Belopolsky40018472011-02-26 01:02:56 +00006711static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006712unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006713 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006714 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006716 /* input state */
6717 Py_ssize_t pos=0, size;
6718 int kind;
6719 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006720 /* pointer into the output */
6721 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006722 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6723 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006724 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006726 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006727 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006728 /* output object */
6729 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730
Benjamin Petersonbac79492012-01-14 13:34:47 -05006731 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006732 return NULL;
6733 size = PyUnicode_GET_LENGTH(unicode);
6734 kind = PyUnicode_KIND(unicode);
6735 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736 /* allocate enough for a simple encoding without
6737 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006738 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006739 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006740
6741 _PyBytesWriter_Init(&writer);
6742 str = _PyBytesWriter_Alloc(&writer, size);
6743 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006744 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006745
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006746 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006747 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006750 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006752 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006754 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006756 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006758 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006759 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006761
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006762 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006764
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006765 /* Only overallocate the buffer if it's not the last write */
6766 writer.overallocate = (collend < size);
6767
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006769 if (error_handler == _Py_ERROR_UNKNOWN)
6770 error_handler = get_error_handler(errors);
6771
6772 switch (error_handler) {
6773 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006774 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006776
6777 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006778 memset(str, '?', collend - collstart);
6779 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006780 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006781 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 break;
Victor Stinner50149202015-09-22 00:26:54 +02006784
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006785 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006786 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006787 writer.min_size -= (collend - collstart);
6788 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006789 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006790 if (str == NULL)
6791 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006792 pos = collend;
6793 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006794
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006796 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006797 writer.min_size -= (collend - collstart);
6798 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006799 unicode, collstart, collend);
6800 if (str == NULL)
6801 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006802 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 break;
Victor Stinner50149202015-09-22 00:26:54 +02006804
Victor Stinnerc3713e92015-09-29 12:32:13 +02006805 case _Py_ERROR_SURROGATEESCAPE:
6806 for (i = collstart; i < collend; ++i) {
6807 ch = PyUnicode_READ(kind, data, i);
6808 if (ch < 0xdc80 || 0xdcff < ch) {
6809 /* Not a UTF-8b surrogate */
6810 break;
6811 }
6812 *str++ = (char)(ch - 0xdc00);
6813 ++pos;
6814 }
6815 if (i >= collend)
6816 break;
6817 collstart = pos;
6818 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006819 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006820
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006822 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6823 encoding, reason, unicode, &exc,
6824 collstart, collend, &newpos);
6825 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006827
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006828 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006829 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006830
Victor Stinner6bd525b2015-10-09 13:10:05 +02006831 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006832 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006833 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006834 PyBytes_AS_STRING(rep),
6835 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006836 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006837 else {
6838 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006839
Victor Stinner6bd525b2015-10-09 13:10:05 +02006840 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006842
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006843 if (limit == 256 ?
6844 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6845 !PyUnicode_IS_ASCII(rep))
6846 {
6847 /* Not all characters are smaller than limit */
6848 raise_encode_exception(&exc, encoding, unicode,
6849 collstart, collend, reason);
6850 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006852 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6853 str = _PyBytesWriter_WriteBytes(&writer, str,
6854 PyUnicode_DATA(rep),
6855 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 }
Miss Islington (bot)1e596d32018-08-19 16:17:53 -04006857 if (str == NULL)
6858 goto onError;
6859
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006860 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006861 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006862 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006863
6864 /* If overallocation was disabled, ensure that it was the last
6865 write. Otherwise, we missed an optimization */
6866 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006867 }
6868 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006869
Victor Stinner50149202015-09-22 00:26:54 +02006870 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006871 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006872 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006873
6874 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006875 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006876 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006877 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006878 Py_XDECREF(exc);
6879 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006880}
6881
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006882/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006883PyObject *
6884PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006885 Py_ssize_t size,
6886 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006889 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890 if (unicode == NULL)
6891 return NULL;
6892 result = unicode_encode_ucs1(unicode, errors, 256);
6893 Py_DECREF(unicode);
6894 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895}
6896
Alexander Belopolsky40018472011-02-26 01:02:56 +00006897PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006898_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899{
6900 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 PyErr_BadArgument();
6902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006904 if (PyUnicode_READY(unicode) == -1)
6905 return NULL;
6906 /* Fast path: if it is a one-byte string, construct
6907 bytes object directly. */
6908 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6909 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6910 PyUnicode_GET_LENGTH(unicode));
6911 /* Non-Latin-1 characters present. Defer to above function to
6912 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006913 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006914}
6915
6916PyObject*
6917PyUnicode_AsLatin1String(PyObject *unicode)
6918{
6919 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920}
6921
6922/* --- 7-bit ASCII Codec -------------------------------------------------- */
6923
Alexander Belopolsky40018472011-02-26 01:02:56 +00006924PyObject *
6925PyUnicode_DecodeASCII(const char *s,
6926 Py_ssize_t size,
6927 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006930 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006931 int kind;
6932 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006933 Py_ssize_t startinpos;
6934 Py_ssize_t endinpos;
6935 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006937 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006939 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006940
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006942 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006943
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006945 if (size == 1 && (unsigned char)s[0] < 128)
6946 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006947
Victor Stinner8f674cc2013-04-17 23:02:17 +02006948 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006949 writer.min_length = size;
6950 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006951 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006954 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006955 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006956 writer.pos = outpos;
6957 if (writer.pos == size)
6958 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006959
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006960 s += writer.pos;
6961 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006962 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006963 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006965 PyUnicode_WRITE(kind, data, writer.pos, c);
6966 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006968 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006970
6971 /* byte outsize range 0x00..0x7f: call the error handler */
6972
6973 if (error_handler == _Py_ERROR_UNKNOWN)
6974 error_handler = get_error_handler(errors);
6975
6976 switch (error_handler)
6977 {
6978 case _Py_ERROR_REPLACE:
6979 case _Py_ERROR_SURROGATEESCAPE:
6980 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006981 but we may switch to UCS2 at the first write */
6982 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6983 goto onError;
6984 kind = writer.kind;
6985 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006986
6987 if (error_handler == _Py_ERROR_REPLACE)
6988 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6989 else
6990 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6991 writer.pos++;
6992 ++s;
6993 break;
6994
6995 case _Py_ERROR_IGNORE:
6996 ++s;
6997 break;
6998
6999 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 startinpos = s-starts;
7001 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007002 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007003 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 "ascii", "ordinal not in range(128)",
7005 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007006 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 kind = writer.kind;
7009 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007012 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007014 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007015
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007017 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007018 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007019 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 return NULL;
7021}
7022
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007023/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007024PyObject *
7025PyUnicode_EncodeASCII(const Py_UNICODE *p,
7026 Py_ssize_t size,
7027 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007030 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007031 if (unicode == NULL)
7032 return NULL;
7033 result = unicode_encode_ucs1(unicode, errors, 128);
7034 Py_DECREF(unicode);
7035 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036}
7037
Alexander Belopolsky40018472011-02-26 01:02:56 +00007038PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007039_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040{
7041 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 PyErr_BadArgument();
7043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007045 if (PyUnicode_READY(unicode) == -1)
7046 return NULL;
7047 /* Fast path: if it is an ASCII-only string, construct bytes object
7048 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007049 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007050 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7051 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007052 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007053}
7054
7055PyObject *
7056PyUnicode_AsASCIIString(PyObject *unicode)
7057{
7058 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059}
7060
Steve Dowercc16be82016-09-08 10:35:16 -07007061#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007062
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007063/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007064
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007065#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066#define NEED_RETRY
7067#endif
7068
Victor Stinner3a50e702011-10-18 21:21:00 +02007069#ifndef WC_ERR_INVALID_CHARS
7070# define WC_ERR_INVALID_CHARS 0x0080
7071#endif
7072
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007073static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007074code_page_name(UINT code_page, PyObject **obj)
7075{
7076 *obj = NULL;
7077 if (code_page == CP_ACP)
7078 return "mbcs";
7079 if (code_page == CP_UTF7)
7080 return "CP_UTF7";
7081 if (code_page == CP_UTF8)
7082 return "CP_UTF8";
7083
7084 *obj = PyBytes_FromFormat("cp%u", code_page);
7085 if (*obj == NULL)
7086 return NULL;
7087 return PyBytes_AS_STRING(*obj);
7088}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090static DWORD
7091decode_code_page_flags(UINT code_page)
7092{
7093 if (code_page == CP_UTF7) {
7094 /* The CP_UTF7 decoder only supports flags=0 */
7095 return 0;
7096 }
7097 else
7098 return MB_ERR_INVALID_CHARS;
7099}
7100
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007101/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007102 * Decode a byte string from a Windows code page into unicode object in strict
7103 * mode.
7104 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007105 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7106 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007108static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007109decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007110 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 const char *in,
7112 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113{
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007115 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117
7118 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 assert(insize > 0);
7120 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7121 if (outsize <= 0)
7122 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123
7124 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007126 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007127 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 if (*v == NULL)
7129 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007130 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007131 }
7132 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007134 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007135 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138 }
7139
7140 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7142 if (outsize <= 0)
7143 goto error;
7144 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007145
Victor Stinner3a50e702011-10-18 21:21:00 +02007146error:
7147 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7148 return -2;
7149 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007150 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007151}
7152
Victor Stinner3a50e702011-10-18 21:21:00 +02007153/*
7154 * Decode a byte string from a code page into unicode object with an error
7155 * handler.
7156 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007157 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 * UnicodeDecodeError exception and returns -1 on error.
7159 */
7160static int
7161decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007162 PyObject **v,
7163 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007164 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007165{
7166 const char *startin = in;
7167 const char *endin = in + size;
7168 const DWORD flags = decode_code_page_flags(code_page);
7169 /* Ideally, we should get reason from FormatMessage. This is the Windows
7170 2000 English version of the message. */
7171 const char *reason = "No mapping for the Unicode character exists "
7172 "in the target code page.";
7173 /* each step cannot decode more than 1 character, but a character can be
7174 represented as a surrogate pair */
7175 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007176 int insize;
7177 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 PyObject *errorHandler = NULL;
7179 PyObject *exc = NULL;
7180 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007181 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 DWORD err;
7183 int ret = -1;
7184
7185 assert(size > 0);
7186
7187 encoding = code_page_name(code_page, &encoding_obj);
7188 if (encoding == NULL)
7189 return -1;
7190
Victor Stinner7d00cc12014-03-17 23:08:06 +01007191 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7193 UnicodeDecodeError. */
7194 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7195 if (exc != NULL) {
7196 PyCodec_StrictErrors(exc);
7197 Py_CLEAR(exc);
7198 }
7199 goto error;
7200 }
7201
7202 if (*v == NULL) {
7203 /* Create unicode object */
7204 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7205 PyErr_NoMemory();
7206 goto error;
7207 }
Victor Stinnerab595942011-12-17 04:59:06 +01007208 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007209 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 if (*v == NULL)
7211 goto error;
7212 startout = PyUnicode_AS_UNICODE(*v);
7213 }
7214 else {
7215 /* Extend unicode object */
7216 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7217 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7218 PyErr_NoMemory();
7219 goto error;
7220 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007221 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 goto error;
7223 startout = PyUnicode_AS_UNICODE(*v) + n;
7224 }
7225
7226 /* Decode the byte string character per character */
7227 out = startout;
7228 while (in < endin)
7229 {
7230 /* Decode a character */
7231 insize = 1;
7232 do
7233 {
7234 outsize = MultiByteToWideChar(code_page, flags,
7235 in, insize,
7236 buffer, Py_ARRAY_LENGTH(buffer));
7237 if (outsize > 0)
7238 break;
7239 err = GetLastError();
7240 if (err != ERROR_NO_UNICODE_TRANSLATION
7241 && err != ERROR_INSUFFICIENT_BUFFER)
7242 {
7243 PyErr_SetFromWindowsErr(0);
7244 goto error;
7245 }
7246 insize++;
7247 }
7248 /* 4=maximum length of a UTF-8 sequence */
7249 while (insize <= 4 && (in + insize) <= endin);
7250
7251 if (outsize <= 0) {
7252 Py_ssize_t startinpos, endinpos, outpos;
7253
Victor Stinner7d00cc12014-03-17 23:08:06 +01007254 /* last character in partial decode? */
7255 if (in + insize >= endin && !final)
7256 break;
7257
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 startinpos = in - startin;
7259 endinpos = startinpos + 1;
7260 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007261 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 errors, &errorHandler,
7263 encoding, reason,
7264 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007265 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007266 {
7267 goto error;
7268 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007269 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 }
7271 else {
7272 in += insize;
7273 memcpy(out, buffer, outsize * sizeof(wchar_t));
7274 out += outsize;
7275 }
7276 }
7277
7278 /* write a NUL character at the end */
7279 *out = 0;
7280
7281 /* Extend unicode object */
7282 outsize = out - startout;
7283 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007284 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007286 /* (in - startin) <= size and size is an int */
7287 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007288
7289error:
7290 Py_XDECREF(encoding_obj);
7291 Py_XDECREF(errorHandler);
7292 Py_XDECREF(exc);
7293 return ret;
7294}
7295
Victor Stinner3a50e702011-10-18 21:21:00 +02007296static PyObject *
7297decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007298 const char *s, Py_ssize_t size,
7299 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007300{
Victor Stinner76a31a62011-11-04 00:05:13 +01007301 PyObject *v = NULL;
7302 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007303
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 if (code_page < 0) {
7305 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7306 return NULL;
7307 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007308 if (size < 0) {
7309 PyErr_BadInternalCall();
7310 return NULL;
7311 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007312
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315
Victor Stinner76a31a62011-11-04 00:05:13 +01007316 do
7317 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007318#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 if (size > INT_MAX) {
7320 chunk_size = INT_MAX;
7321 final = 0;
7322 done = 0;
7323 }
7324 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 {
7327 chunk_size = (int)size;
7328 final = (consumed == NULL);
7329 done = 1;
7330 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007331
Victor Stinner76a31a62011-11-04 00:05:13 +01007332 if (chunk_size == 0 && done) {
7333 if (v != NULL)
7334 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007335 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 converted = decode_code_page_strict(code_page, &v,
7339 s, chunk_size);
7340 if (converted == -2)
7341 converted = decode_code_page_errors(code_page, &v,
7342 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007343 errors, final);
7344 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007345
7346 if (converted < 0) {
7347 Py_XDECREF(v);
7348 return NULL;
7349 }
7350
7351 if (consumed)
7352 *consumed += converted;
7353
7354 s += converted;
7355 size -= converted;
7356 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007357
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007358 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359}
7360
Alexander Belopolsky40018472011-02-26 01:02:56 +00007361PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007362PyUnicode_DecodeCodePageStateful(int code_page,
7363 const char *s,
7364 Py_ssize_t size,
7365 const char *errors,
7366 Py_ssize_t *consumed)
7367{
7368 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7369}
7370
7371PyObject *
7372PyUnicode_DecodeMBCSStateful(const char *s,
7373 Py_ssize_t size,
7374 const char *errors,
7375 Py_ssize_t *consumed)
7376{
7377 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7378}
7379
7380PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007381PyUnicode_DecodeMBCS(const char *s,
7382 Py_ssize_t size,
7383 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007384{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007385 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7386}
7387
Victor Stinner3a50e702011-10-18 21:21:00 +02007388static DWORD
7389encode_code_page_flags(UINT code_page, const char *errors)
7390{
7391 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007392 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 }
7394 else if (code_page == CP_UTF7) {
7395 /* CP_UTF7 only supports flags=0 */
7396 return 0;
7397 }
7398 else {
7399 if (errors != NULL && strcmp(errors, "replace") == 0)
7400 return 0;
7401 else
7402 return WC_NO_BEST_FIT_CHARS;
7403 }
7404}
7405
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007406/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 * Encode a Unicode string to a Windows code page into a byte string in strict
7408 * mode.
7409 *
7410 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007411 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007413static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007414encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007415 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417{
Victor Stinner554f3f02010-06-16 23:33:54 +00007418 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 BOOL *pusedDefaultChar = &usedDefaultChar;
7420 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007421 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 const DWORD flags = encode_code_page_flags(code_page, NULL);
7424 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007425 /* Create a substring so that we can get the UTF-16 representation
7426 of just the slice under consideration. */
7427 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007428
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007430
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007432 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007434 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007435
Victor Stinner2fc507f2011-11-04 20:06:39 +01007436 substring = PyUnicode_Substring(unicode, offset, offset+len);
7437 if (substring == NULL)
7438 return -1;
7439 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7440 if (p == NULL) {
7441 Py_DECREF(substring);
7442 return -1;
7443 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007444 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007445
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007446 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007448 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 NULL, 0,
7450 NULL, pusedDefaultChar);
7451 if (outsize <= 0)
7452 goto error;
7453 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007454 if (pusedDefaultChar && *pusedDefaultChar) {
7455 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007458
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 if (*outbytes == NULL) {
7463 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007467 }
7468 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 const Py_ssize_t n = PyBytes_Size(*outbytes);
7471 if (outsize > PY_SSIZE_T_MAX - n) {
7472 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7477 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007481 }
7482
7483 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007485 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 out, outsize,
7487 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007488 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 if (outsize <= 0)
7490 goto error;
7491 if (pusedDefaultChar && *pusedDefaultChar)
7492 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007493 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007496 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7498 return -2;
7499 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007500 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007501}
7502
Victor Stinner3a50e702011-10-18 21:21:00 +02007503/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007504 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 * error handler.
7506 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007507 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 * -1 on other error.
7509 */
7510static int
7511encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007512 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007514{
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007516 Py_ssize_t pos = unicode_offset;
7517 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 /* Ideally, we should get reason from FormatMessage. This is the Windows
7519 2000 English version of the message. */
7520 const char *reason = "invalid character";
7521 /* 4=maximum length of a UTF-8 sequence */
7522 char buffer[4];
7523 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7524 Py_ssize_t outsize;
7525 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 PyObject *errorHandler = NULL;
7527 PyObject *exc = NULL;
7528 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007529 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007530 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 PyObject *rep;
7532 int ret = -1;
7533
7534 assert(insize > 0);
7535
7536 encoding = code_page_name(code_page, &encoding_obj);
7537 if (encoding == NULL)
7538 return -1;
7539
7540 if (errors == NULL || strcmp(errors, "strict") == 0) {
7541 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7542 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007543 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 if (exc != NULL) {
7545 PyCodec_StrictErrors(exc);
7546 Py_DECREF(exc);
7547 }
7548 Py_XDECREF(encoding_obj);
7549 return -1;
7550 }
7551
7552 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7553 pusedDefaultChar = &usedDefaultChar;
7554 else
7555 pusedDefaultChar = NULL;
7556
7557 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7558 PyErr_NoMemory();
7559 goto error;
7560 }
7561 outsize = insize * Py_ARRAY_LENGTH(buffer);
7562
7563 if (*outbytes == NULL) {
7564 /* Create string object */
7565 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7566 if (*outbytes == NULL)
7567 goto error;
7568 out = PyBytes_AS_STRING(*outbytes);
7569 }
7570 else {
7571 /* Extend string object */
7572 Py_ssize_t n = PyBytes_Size(*outbytes);
7573 if (n > PY_SSIZE_T_MAX - outsize) {
7574 PyErr_NoMemory();
7575 goto error;
7576 }
7577 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7578 goto error;
7579 out = PyBytes_AS_STRING(*outbytes) + n;
7580 }
7581
7582 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007583 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007585 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7586 wchar_t chars[2];
7587 int charsize;
7588 if (ch < 0x10000) {
7589 chars[0] = (wchar_t)ch;
7590 charsize = 1;
7591 }
7592 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007593 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7594 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007595 charsize = 2;
7596 }
7597
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007599 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 buffer, Py_ARRAY_LENGTH(buffer),
7601 NULL, pusedDefaultChar);
7602 if (outsize > 0) {
7603 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7604 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007605 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 memcpy(out, buffer, outsize);
7607 out += outsize;
7608 continue;
7609 }
7610 }
7611 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7612 PyErr_SetFromWindowsErr(0);
7613 goto error;
7614 }
7615
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 rep = unicode_encode_call_errorhandler(
7617 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007618 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007619 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 if (rep == NULL)
7621 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007622 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007623
7624 if (PyBytes_Check(rep)) {
7625 outsize = PyBytes_GET_SIZE(rep);
7626 if (outsize != 1) {
7627 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7628 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7629 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7630 Py_DECREF(rep);
7631 goto error;
7632 }
7633 out = PyBytes_AS_STRING(*outbytes) + offset;
7634 }
7635 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7636 out += outsize;
7637 }
7638 else {
7639 Py_ssize_t i;
7640 enum PyUnicode_Kind kind;
7641 void *data;
7642
Benjamin Petersonbac79492012-01-14 13:34:47 -05007643 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 Py_DECREF(rep);
7645 goto error;
7646 }
7647
7648 outsize = PyUnicode_GET_LENGTH(rep);
7649 if (outsize != 1) {
7650 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7651 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7652 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7653 Py_DECREF(rep);
7654 goto error;
7655 }
7656 out = PyBytes_AS_STRING(*outbytes) + offset;
7657 }
7658 kind = PyUnicode_KIND(rep);
7659 data = PyUnicode_DATA(rep);
7660 for (i=0; i < outsize; i++) {
7661 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7662 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007663 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007664 encoding, unicode,
7665 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007666 "unable to encode error handler result to ASCII");
7667 Py_DECREF(rep);
7668 goto error;
7669 }
7670 *out = (unsigned char)ch;
7671 out++;
7672 }
7673 }
7674 Py_DECREF(rep);
7675 }
7676 /* write a NUL byte */
7677 *out = 0;
7678 outsize = out - PyBytes_AS_STRING(*outbytes);
7679 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7680 if (_PyBytes_Resize(outbytes, outsize) < 0)
7681 goto error;
7682 ret = 0;
7683
7684error:
7685 Py_XDECREF(encoding_obj);
7686 Py_XDECREF(errorHandler);
7687 Py_XDECREF(exc);
7688 return ret;
7689}
7690
Victor Stinner3a50e702011-10-18 21:21:00 +02007691static PyObject *
7692encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007693 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 const char *errors)
7695{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007696 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007697 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007698 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007699 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007700
Victor Stinner29dacf22015-01-26 16:41:32 +01007701 if (!PyUnicode_Check(unicode)) {
7702 PyErr_BadArgument();
7703 return NULL;
7704 }
7705
Benjamin Petersonbac79492012-01-14 13:34:47 -05007706 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007707 return NULL;
7708 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007709
Victor Stinner3a50e702011-10-18 21:21:00 +02007710 if (code_page < 0) {
7711 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7712 return NULL;
7713 }
7714
Martin v. Löwis3d325192011-11-04 18:23:06 +01007715 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007716 return PyBytes_FromStringAndSize(NULL, 0);
7717
Victor Stinner7581cef2011-11-03 22:32:33 +01007718 offset = 0;
7719 do
7720 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007721#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007722 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007723 chunks. */
7724 if (len > INT_MAX/2) {
7725 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 done = 0;
7727 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007728 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007730 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 done = 1;
7733 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007734
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007736 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007737 errors);
7738 if (ret == -2)
7739 ret = encode_code_page_errors(code_page, &outbytes,
7740 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007741 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007742 if (ret < 0) {
7743 Py_XDECREF(outbytes);
7744 return NULL;
7745 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007746
Victor Stinner7581cef2011-11-03 22:32:33 +01007747 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007748 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007749 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007750
Victor Stinner3a50e702011-10-18 21:21:00 +02007751 return outbytes;
7752}
7753
7754PyObject *
7755PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7756 Py_ssize_t size,
7757 const char *errors)
7758{
Victor Stinner7581cef2011-11-03 22:32:33 +01007759 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007760 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007761 if (unicode == NULL)
7762 return NULL;
7763 res = encode_code_page(CP_ACP, unicode, errors);
7764 Py_DECREF(unicode);
7765 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007766}
7767
/* Public entry point: encode 'unicode' with the Windows code page
   'code_page' using the error handler named by 'errors'.  All of the work
   is delegated to encode_code_page(); its result is returned unchanged. */
7768PyObject *
7769PyUnicode_EncodeCodePage(int code_page,
7770 PyObject *unicode,
7771 const char *errors)
7772{
Victor Stinner7581cef2011-11-03 22:32:33 +01007773 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007774}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007775
/* Encode 'unicode' with code page CP_ACP.  'errors' is passed as NULL;
   the code-page encoder in this file treats NULL like "strict". */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007776PyObject *
7777PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007778{
Victor Stinner7581cef2011-11-03 22:32:33 +01007779 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007780}
7781
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007782#undef NEED_RETRY
7783
Steve Dowercc16be82016-09-08 10:35:16 -07007784#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007785
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786/* --- Character Mapping Codec -------------------------------------------- */
7787
/* Decode the 'size' bytes at 's' through 'mapping', a str used as a lookup
   table indexed by byte value, and append the result to 'writer'.

   A byte whose value is not below the table length, or whose table entry is
   0xFFFE, is undefined and is handed to the decode error handler selected
   by 'errors'.

   Returns 0 on success and -1 on failure. */
Victor Stinnerfb161b12013-04-18 01:44:27 +02007788static int
7789charmap_decode_string(const char *s,
7790 Py_ssize_t size,
7791 PyObject *mapping,
7792 const char *errors,
7793 _PyUnicodeWriter *writer)
7794{
7795 const char *starts = s;
7796 const char *e;
7797 Py_ssize_t startinpos, endinpos;
7798 PyObject *errorHandler = NULL, *exc = NULL;
7799 Py_ssize_t maplen;
7800 enum PyUnicode_Kind mapkind;
7801 void *mapdata;
7802 Py_UCS4 x;
7803 unsigned char ch;
7804
7805 if (PyUnicode_READY(mapping) == -1)
7806 return -1;
7807
7808 maplen = PyUnicode_GET_LENGTH(mapping);
7809 mapdata = PyUnicode_DATA(mapping);
7810 mapkind = PyUnicode_KIND(mapping);
7811
7812 e = s + size;
7813
7814 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7815 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7816 * is disabled in encoding aliases, latin1 is preferred because
7817 * its implementation is faster. */
7818 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7819 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7820 Py_UCS4 maxchar = writer->maxchar;
7821
7822 assert (writer->kind == PyUnicode_1BYTE_KIND);
7823 while (s < e) {
7824 ch = *s;
7825 x = mapdata_ucs1[ch];
7826 if (x > maxchar) {
        /* x is above the writer's current maxchar: prepare the writer for
           one character of up to 0xff, then reload the cached maxchar and
           data pointer. */
7827 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7828 goto onError;
7829 maxchar = writer->maxchar;
7830 outdata = (Py_UCS1 *)writer->data;
7831 }
7832 outdata[writer->pos] = x;
7833 writer->pos++;
7834 ++s;
7835 }
    /* No error handler is called on this path, so errorHandler and exc
       are still NULL here. */
7836 return 0;
7837 }
7838
    /* General loop.  Each pass first tries a tight inner loop when the table
       is 2-byte with at least 256 entries and the writer is currently 1-byte
       or 2-byte; the inner loops leave through "goto Error" as soon as a
       value cannot be stored directly. */
7839 while (s < e) {
7840 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7841 enum PyUnicode_Kind outkind = writer->kind;
7842 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7843 if (outkind == PyUnicode_1BYTE_KIND) {
7844 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7845 Py_UCS4 maxchar = writer->maxchar;
7846 while (s < e) {
7847 ch = *s;
7848 x = mapdata_ucs2[ch];
        /* x is above the writer's current maxchar: continue at Error, which
           either reports 0xFFFE as undefined or writes x through
           _PyUnicodeWriter_WriteCharInline(). */
7849 if (x > maxchar)
7850 goto Error;
7851 outdata[writer->pos] = x;
7852 writer->pos++;
7853 ++s;
7854 }
7855 break;
7856 }
7857 else if (outkind == PyUnicode_2BYTE_KIND) {
7858 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7859 while (s < e) {
7860 ch = *s;
7861 x = mapdata_ucs2[ch];
7862 if (x == 0xFFFE)
7863 goto Error;
7864 outdata[writer->pos] = x;
7865 writer->pos++;
7866 ++s;
7867 }
7868 break;
7869 }
7870 }
    /* Generic single-character path: bounds-checked table lookup. */
7871 ch = *s;
7872
7873 if (ch < maplen)
7874 x = PyUnicode_READ(mapkind, mapdata, ch);
7875 else
7876 x = 0xfffe; /* invalid value */
    /* Also the landing point of the inner loops above; 's' still points at
       the byte that produced 'x'. */
7877Error:
7878 if (x == 0xfffe)
7879 {
7880 /* undefined mapping */
7881 startinpos = s-starts;
7882 endinpos = startinpos+1;
7883 if (unicode_decode_call_errorhandler_writer(
7884 errors, &errorHandler,
7885 "charmap", "character maps to <undefined>",
7886 &starts, &e, &startinpos, &endinpos, &exc, &s,
7887 writer)) {
7888 goto onError;
7889 }
    /* 's' is passed by address to the handler call above, so it is not
       advanced here. */
7890 continue;
7891 }
7892
7893 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7894 goto onError;
7895 ++s;
7896 }
7897 Py_XDECREF(errorHandler);
7898 Py_XDECREF(exc);
7899 return 0;
7900
7901onError:
7902 Py_XDECREF(errorHandler);
7903 Py_XDECREF(exc);
7904 return -1;
7905}
7906
/* Decode the 'size' bytes at 's' through 'mapping', an arbitrary object
   queried with PyObject_GetItem(mapping, <byte value as int>), and append
   the result to 'writer'.

   Per byte, the looked-up item is handled as follows:
     - lookup raised LookupError, item is None, or item is the integer
       0xFFFE or the one-character string U+FFFE: the byte is undefined and
       the decode error handler selected by 'errors' is called;
     - integer in 0..MAX_UNICODE: written as that code point (other integers
       raise TypeError);
     - str: written to the output;
     - anything else: TypeError.

   Returns 0 on success and -1 on failure. */
7907static int
7908charmap_decode_mapping(const char *s,
7909 Py_ssize_t size,
7910 PyObject *mapping,
7911 const char *errors,
7912 _PyUnicodeWriter *writer)
7913{
7914 const char *starts = s;
7915 const char *e;
7916 Py_ssize_t startinpos, endinpos;
7917 PyObject *errorHandler = NULL, *exc = NULL;
7918 unsigned char ch;
    /* 'item' holds the reference returned by the current lookup; it is
       cleared on every path out of an iteration and released at onError. */
Victor Stinnerf4f24242013-05-07 01:01:31 +02007919 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007920
7921 e = s + size;
7922
7923 while (s < e) {
7924 ch = *s;
7925
7926 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7927 key = PyLong_FromLong((long)ch);
7928 if (key == NULL)
7929 goto onError;
7930
7931 item = PyObject_GetItem(mapping, key);
7932 Py_DECREF(key);
7933 if (item == NULL) {
7934 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7935 /* No mapping found means: mapping is undefined. */
7936 PyErr_Clear();
7937 goto Undefined;
7938 } else
7939 goto onError;
7940 }
7941
7942 /* Apply mapping */
7943 if (item == Py_None)
7944 goto Undefined;
7945 if (PyLong_Check(item)) {
7946 long value = PyLong_AS_LONG(item);
7947 if (value == 0xFFFE)
7948 goto Undefined;
7949 if (value < 0 || value > MAX_UNICODE) {
7950 PyErr_Format(PyExc_TypeError,
7951 "character mapping must be in range(0x%lx)",
7952 (unsigned long)MAX_UNICODE + 1);
7953 goto onError;
7954 }
7955
7956 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7957 goto onError;
7958 }
7959 else if (PyUnicode_Check(item)) {
7960 if (PyUnicode_READY(item) == -1)
7961 goto onError;
7962 if (PyUnicode_GET_LENGTH(item) == 1) {
7963 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7964 if (value == 0xFFFE)
7965 goto Undefined;
7966 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7967 goto onError;
7968 }
7969 else {
        /* Replacement string whose length is not 1: switch on the writer's
           overallocate flag before writing it. */
7970 writer->overallocate = 1;
7971 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7972 goto onError;
7973 }
7974 }
7975 else {
7976 /* wrong return value */
7977 PyErr_SetString(PyExc_TypeError,
7978 "character mapping must return integer, None or str");
7979 goto onError;
7980 }
7981 Py_CLEAR(item);
7982 ++s;
7983 continue;
7984
    /* Reached only by goto; 's' still points at the undefined byte and is
       passed by address to the handler call below. */
7985Undefined:
7986 /* undefined mapping */
7987 Py_CLEAR(item);
7988 startinpos = s-starts;
7989 endinpos = startinpos+1;
7990 if (unicode_decode_call_errorhandler_writer(
7991 errors, &errorHandler,
7992 "charmap", "character maps to <undefined>",
7993 &starts, &e, &startinpos, &endinpos, &exc, &s,
7994 writer)) {
7995 goto onError;
7996 }
7997 }
7998 Py_XDECREF(errorHandler);
7999 Py_XDECREF(exc);
8000 return 0;
8001
8002onError:
8003 Py_XDECREF(item);
8004 Py_XDECREF(errorHandler);
8005 Py_XDECREF(exc);
8006 return -1;
8007}
8008
Alexander Belopolsky40018472011-02-26 01:02:56 +00008009PyObject *
8010PyUnicode_DecodeCharmap(const char *s,
8011 Py_ssize_t size,
8012 PyObject *mapping,
8013 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008015 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008016
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 /* Default to Latin-1 */
8018 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008022 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008023 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008024 writer.min_length = size;
8025 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008027
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008028 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008029 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8030 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008031 }
8032 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008033 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8034 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008036 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008037
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008039 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 return NULL;
8041}
8042
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043/* Charmap encoding: the lookup table */
8044
/* Three-level lookup trie from a BMP code point to an output byte.
   Objects are allocated larger than sizeof(struct encoding_map); the extra
   bytes extend level23 (see PyUnicode_BuildEncodingMap). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 PyObject_HEAD
    /* Indexed by (code point >> 11): number of the 16-entry level-2 block,
       or 0xFF when the range has no mapping. */
8047 unsigned char level1[32];
    /* Number of level-2 blocks (16 bytes each) and of level-3 blocks
       (128 bytes each) stored in level23. */
8048 int count2, count3;
    /* First byte of the variable-sized tail: 16*count2 bytes of level-2
       data followed by 128*count3 bytes of level-3 data.  A level-3 value
       of 0 means "no mapping". */
8049 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050};
8051
8052static PyObject*
8053encoding_map_size(PyObject *obj, PyObject* args)
8054{
8055 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058}
8059
/* Method table installed as tp_methods of EncodingMapType: a single
   no-argument method "size", terminated by a zeroed sentinel entry. */
8060static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 PyDoc_STR("Return the size (in bytes) of this object") },
8063 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064};
8065
/* tp_dealloc of EncodingMapType.  struct encoding_map holds only bytes and
   ints (no object references), so freeing the memory block is all that is
   needed; it pairs with the PyObject_MALLOC in PyUnicode_BuildEncodingMap. */
8066static void
8067encoding_map_dealloc(PyObject* o)
8068{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008069 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070}
8071
/* Type object for the trie objects created by PyUnicode_BuildEncodingMap()
   (which passes &EncodingMapType to PyObject_Init).  Besides the name, basic
   size and default flags, only tp_dealloc and tp_methods are filled in;
   every other slot, including tp_new, is 0.  tp_basicsize is the bare
   struct size even though instances are allocated with extra trailing
   bytes. */
8072static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008073 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 "EncodingMap", /*tp_name*/
8075 sizeof(struct encoding_map), /*tp_basicsize*/
8076 0, /*tp_itemsize*/
8077 /* methods */
8078 encoding_map_dealloc, /*tp_dealloc*/
8079 0, /*tp_print*/
8080 0, /*tp_getattr*/
8081 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008082 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 0, /*tp_repr*/
8084 0, /*tp_as_number*/
8085 0, /*tp_as_sequence*/
8086 0, /*tp_as_mapping*/
8087 0, /*tp_hash*/
8088 0, /*tp_call*/
8089 0, /*tp_str*/
8090 0, /*tp_getattro*/
8091 0, /*tp_setattro*/
8092 0, /*tp_as_buffer*/
8093 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8094 0, /*tp_doc*/
8095 0, /*tp_traverse*/
8096 0, /*tp_clear*/
8097 0, /*tp_richcompare*/
8098 0, /*tp_weaklistoffset*/
8099 0, /*tp_iter*/
8100 0, /*tp_iternext*/
8101 encoding_map_methods, /*tp_methods*/
8102 0, /*tp_members*/
8103 0, /*tp_getset*/
8104 0, /*tp_base*/
8105 0, /*tp_dict*/
8106 0, /*tp_descr_get*/
8107 0, /*tp_descr_set*/
8108 0, /*tp_dictoffset*/
8109 0, /*tp_init*/
8110 0, /*tp_alloc*/
8111 0, /*tp_new*/
8112 0, /*tp_free*/
8113 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114};
8115
8116PyObject*
8117PyUnicode_BuildEncodingMap(PyObject* string)
8118{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119 PyObject *result;
8120 struct encoding_map *mresult;
8121 int i;
8122 int need_dict = 0;
8123 unsigned char level1[32];
8124 unsigned char level2[512];
8125 unsigned char *mlevel1, *mlevel2, *mlevel3;
8126 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008127 int kind;
8128 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008129 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008132 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 PyErr_BadArgument();
8134 return NULL;
8135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008136 kind = PyUnicode_KIND(string);
8137 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008138 length = PyUnicode_GET_LENGTH(string);
8139 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008140 memset(level1, 0xFF, sizeof level1);
8141 memset(level2, 0xFF, sizeof level2);
8142
8143 /* If there isn't a one-to-one mapping of NULL to \0,
8144 or if there are non-BMP characters, we need to use
8145 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008148 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008150 ch = PyUnicode_READ(kind, data, i);
8151 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 need_dict = 1;
8153 break;
8154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 /* unmapped character */
8157 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 l1 = ch >> 11;
8159 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 if (level1[l1] == 0xFF)
8161 level1[l1] = count2++;
8162 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008163 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008164 }
8165
8166 if (count2 >= 0xFF || count3 >= 0xFF)
8167 need_dict = 1;
8168
8169 if (need_dict) {
8170 PyObject *result = PyDict_New();
8171 PyObject *key, *value;
8172 if (!result)
8173 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008174 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008176 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 if (!key || !value)
8178 goto failed1;
8179 if (PyDict_SetItem(result, key, value) == -1)
8180 goto failed1;
8181 Py_DECREF(key);
8182 Py_DECREF(value);
8183 }
8184 return result;
8185 failed1:
8186 Py_XDECREF(key);
8187 Py_XDECREF(value);
8188 Py_DECREF(result);
8189 return NULL;
8190 }
8191
8192 /* Create a three-level trie */
8193 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8194 16*count2 + 128*count3 - 1);
8195 if (!result)
8196 return PyErr_NoMemory();
8197 PyObject_Init(result, &EncodingMapType);
8198 mresult = (struct encoding_map*)result;
8199 mresult->count2 = count2;
8200 mresult->count3 = count3;
8201 mlevel1 = mresult->level1;
8202 mlevel2 = mresult->level23;
8203 mlevel3 = mresult->level23 + 16*count2;
8204 memcpy(mlevel1, level1, 32);
8205 memset(mlevel2, 0xFF, 16*count2);
8206 memset(mlevel3, 0, 128*count3);
8207 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008208 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008209 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008210 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8211 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008212 /* unmapped character */
8213 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008214 o1 = ch>>11;
8215 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008216 i2 = 16*mlevel1[o1] + o2;
8217 if (mlevel2[i2] == 0xFF)
8218 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008219 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220 i3 = 128*mlevel2[i2] + o3;
8221 mlevel3[i3] = i;
8222 }
8223 return result;
8224}
8225
8226static int
Victor Stinner22168992011-11-20 17:09:18 +01008227encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008228{
8229 struct encoding_map *map = (struct encoding_map*)mapping;
8230 int l1 = c>>11;
8231 int l2 = (c>>7) & 0xF;
8232 int l3 = c & 0x7F;
8233 int i;
8234
Victor Stinner22168992011-11-20 17:09:18 +01008235 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008237 if (c == 0)
8238 return 0;
8239 /* level 1*/
8240 i = map->level1[l1];
8241 if (i == 0xFF) {
8242 return -1;
8243 }
8244 /* level 2*/
8245 i = map->level23[16*i+l2];
8246 if (i == 0xFF) {
8247 return -1;
8248 }
8249 /* level 3 */
8250 i = map->level23[16*map->count2 + 128*i + l3];
8251 if (i == 0) {
8252 return -1;
8253 }
8254 return i;
8255}
8256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257/* Lookup the character ch in the mapping. If the character
8258 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008259 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008260static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008261charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262{
Christian Heimes217cfd12007-12-02 14:31:20 +00008263 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 PyObject *x;
8265
8266 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 x = PyObject_GetItem(mapping, w);
8269 Py_DECREF(w);
8270 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8272 /* No mapping found means: mapping is undefined. */
8273 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008274 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 } else
8276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008278 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008280 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 long value = PyLong_AS_LONG(x);
8282 if (value < 0 || value > 255) {
8283 PyErr_SetString(PyExc_TypeError,
8284 "character mapping must be in range(256)");
8285 Py_DECREF(x);
8286 return NULL;
8287 }
8288 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008290 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 /* wrong return value */
8294 PyErr_Format(PyExc_TypeError,
8295 "character mapping must return integer, bytes or None, not %.400s",
8296 x->ob_type->tp_name);
8297 Py_DECREF(x);
8298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 }
8300}
8301
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008302static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008303charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008304{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008305 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8306 /* exponentially overallocate to minimize reallocations */
8307 if (requiredsize < 2*outsize)
8308 requiredsize = 2*outsize;
8309 if (_PyBytes_Resize(outobj, requiredsize))
8310 return -1;
8311 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312}
8313
/* Outcome of charmapencode_output(): enc_SUCCESS - the byte(s) for the
   character were stored; enc_FAILED - the character has no mapping and
   nothing was written; enc_EXCEPTION - an error occurred. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00008314typedef enum charmapencode_result {
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 enc_SUCCESS, enc_FAILED, enc_EXCEPTION
Alexander Belopolsky40018472011-02-26 01:02:56 +00008316} charmapencode_result;
/* lookup the character, put the result in the output string and adjust
   various state variables. Resize the output bytes object if not enough
   space is available. Return enc_SUCCESS if the character was written,
   enc_FAILED if the mapping was undefined (in which case no character
   was written) or enc_EXCEPTION if the lookup or a reallocation
   failed. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008323static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008324charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008327 PyObject *rep;
8328 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008329 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330
Christian Heimes90aa7642007-12-19 02:45:37 +00008331 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 if (res == -1)
8335 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 if (outsize<requiredsize)
8337 if (charmapencode_resize(outobj, outpos, requiredsize))
8338 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008339 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 outstart[(*outpos)++] = (char)res;
8341 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 }
8343
8344 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 Py_DECREF(rep);
8349 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 if (PyLong_Check(rep)) {
8352 Py_ssize_t requiredsize = *outpos+1;
8353 if (outsize<requiredsize)
8354 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8355 Py_DECREF(rep);
8356 return enc_EXCEPTION;
8357 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008358 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008360 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 else {
8362 const char *repchars = PyBytes_AS_STRING(rep);
8363 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8364 Py_ssize_t requiredsize = *outpos+repsize;
8365 if (outsize<requiredsize)
8366 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8367 Py_DECREF(rep);
8368 return enc_EXCEPTION;
8369 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008370 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 memcpy(outstart + *outpos, repchars, repsize);
8372 *outpos += repsize;
8373 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008375 Py_DECREF(rep);
8376 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377}
8378
8379/* handle an error in PyUnicode_EncodeCharmap
8380 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008381static int
8382charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008383 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008385 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008386 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387{
8388 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008390 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008391 enum PyUnicode_Kind kind;
8392 void *data;
8393 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008395 Py_ssize_t collstartpos = *inpos;
8396 Py_ssize_t collendpos = *inpos+1;
8397 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008398 const char *encoding = "charmap";
8399 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008400 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008401 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008402 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403
Benjamin Petersonbac79492012-01-14 13:34:47 -05008404 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008405 return -1;
8406 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 /* find all unencodable characters */
8408 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008409 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008410 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008412 val = encoding_map_lookup(ch, mapping);
8413 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 break;
8415 ++collendpos;
8416 continue;
8417 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8420 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 if (rep==NULL)
8422 return -1;
8423 else if (rep!=Py_None) {
8424 Py_DECREF(rep);
8425 break;
8426 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008427 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 }
8430 /* cache callback name lookup
8431 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008432 if (*error_handler == _Py_ERROR_UNKNOWN)
8433 *error_handler = get_error_handler(errors);
8434
8435 switch (*error_handler) {
8436 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008437 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008439
8440 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 x = charmapencode_output('?', mapping, res, respos);
8443 if (x==enc_EXCEPTION) {
8444 return -1;
8445 }
8446 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008447 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return -1;
8449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008450 }
8451 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008452 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 *inpos = collendpos;
8454 break;
Victor Stinner50149202015-09-22 00:26:54 +02008455
8456 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 /* generate replacement (temporarily (mis)uses p) */
8458 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 char buffer[2+29+1+1];
8460 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008461 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 for (cp = buffer; *cp; ++cp) {
8463 x = charmapencode_output(*cp, mapping, res, respos);
8464 if (x==enc_EXCEPTION)
8465 return -1;
8466 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008467 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return -1;
8469 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008470 }
8471 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 *inpos = collendpos;
8473 break;
Victor Stinner50149202015-09-22 00:26:54 +02008474
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 default:
Victor Stinner50149202015-09-22 00:26:54 +02008476 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008477 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008481 if (PyBytes_Check(repunicode)) {
8482 /* Directly copy bytes result to output. */
8483 Py_ssize_t outsize = PyBytes_Size(*res);
8484 Py_ssize_t requiredsize;
8485 repsize = PyBytes_Size(repunicode);
8486 requiredsize = *respos + repsize;
8487 if (requiredsize > outsize)
8488 /* Make room for all additional bytes. */
8489 if (charmapencode_resize(res, respos, requiredsize)) {
8490 Py_DECREF(repunicode);
8491 return -1;
8492 }
8493 memcpy(PyBytes_AsString(*res) + *respos,
8494 PyBytes_AsString(repunicode), repsize);
8495 *respos += repsize;
8496 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008497 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008498 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008500 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008501 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008502 Py_DECREF(repunicode);
8503 return -1;
8504 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008505 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008506 data = PyUnicode_DATA(repunicode);
8507 kind = PyUnicode_KIND(repunicode);
8508 for (index = 0; index < repsize; index++) {
8509 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8510 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008512 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 return -1;
8514 }
8515 else if (x==enc_FAILED) {
8516 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008517 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return -1;
8519 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008520 }
8521 *inpos = newpos;
8522 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 }
8524 return 0;
8525}
8526
Alexander Belopolsky40018472011-02-26 01:02:56 +00008527PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008528_PyUnicode_EncodeCharmap(PyObject *unicode,
8529 PyObject *mapping,
8530 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 /* output object */
8533 PyObject *res = NULL;
8534 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008535 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008539 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008541 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008542 void *data;
8543 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544
Benjamin Petersonbac79492012-01-14 13:34:47 -05008545 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008546 return NULL;
8547 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008548 data = PyUnicode_DATA(unicode);
8549 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 /* Default to Latin-1 */
8552 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008553 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 /* allocate enough for a simple encoding without
8556 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008557 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 if (res == NULL)
8559 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008560 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008564 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008566 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 if (x==enc_EXCEPTION) /* error */
8568 goto onError;
8569 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008570 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008572 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 &res, &respos)) {
8574 goto onError;
8575 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008576 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 else
8578 /* done with this character => adjust input position */
8579 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008583 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008584 if (_PyBytes_Resize(&res, respos) < 0)
8585 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008588 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 return res;
8590
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 Py_XDECREF(res);
8593 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008594 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 return NULL;
8596}
8597
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008598/* Deprecated */
8599PyObject *
8600PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8601 Py_ssize_t size,
8602 PyObject *mapping,
8603 const char *errors)
8604{
8605 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008606 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008607 if (unicode == NULL)
8608 return NULL;
8609 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8610 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008611 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612}
8613
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614PyObject *
8615PyUnicode_AsCharmapString(PyObject *unicode,
8616 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617{
8618 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 PyErr_BadArgument();
8620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008622 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623}
8624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008626static void
8627make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629 Py_ssize_t startpos, Py_ssize_t endpos,
8630 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 *exceptionObject = _PyUnicodeTranslateError_Create(
8634 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
8636 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8638 goto onError;
8639 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8640 goto onError;
8641 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8642 goto onError;
8643 return;
8644 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008645 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
8647}
8648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649/* error handling callback helper:
8650 build arguments, call the callback and check the arguments,
8651 put the result into newpos and return the replacement string, which
8652 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008653static PyObject *
8654unicode_translate_call_errorhandler(const char *errors,
8655 PyObject **errorHandler,
8656 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658 Py_ssize_t startpos, Py_ssize_t endpos,
8659 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008661 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008663 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664 PyObject *restuple;
8665 PyObject *resunicode;
8666
8667 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 }
8672
8673 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008678 restuple = PyObject_CallFunctionObjArgs(
8679 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008683 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 Py_DECREF(restuple);
8685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008687 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 &resunicode, &i_newpos)) {
8689 Py_DECREF(restuple);
8690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008692 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 else
8695 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008697 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 Py_DECREF(restuple);
8699 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 Py_INCREF(resunicode);
8702 Py_DECREF(restuple);
8703 return resunicode;
8704}
8705
8706/* Lookup the character ch in the mapping and put the result in result,
8707 which must be decrefed by the caller.
8708 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008709static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711{
Christian Heimes217cfd12007-12-02 14:31:20 +00008712 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713 PyObject *x;
8714
8715 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 x = PyObject_GetItem(mapping, w);
8718 Py_DECREF(w);
8719 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8721 /* No mapping found means: use 1:1 mapping. */
8722 PyErr_Clear();
8723 *result = NULL;
8724 return 0;
8725 } else
8726 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 }
8728 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 *result = x;
8730 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008732 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008734 if (value < 0 || value > MAX_UNICODE) {
8735 PyErr_Format(PyExc_ValueError,
8736 "character mapping must be in range(0x%x)",
8737 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 Py_DECREF(x);
8739 return -1;
8740 }
8741 *result = x;
8742 return 0;
8743 }
8744 else if (PyUnicode_Check(x)) {
8745 *result = x;
8746 return 0;
8747 }
8748 else {
8749 /* wrong return value */
8750 PyErr_SetString(PyExc_TypeError,
8751 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008752 Py_DECREF(x);
8753 return -1;
8754 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755}
Victor Stinner1194ea02014-04-04 19:37:40 +02008756
8757/* lookup the character, write the result into the writer.
8758 Return 1 if the result was written into the writer, return 0 if the mapping
8759 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008760static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008761charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8762 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763{
Victor Stinner1194ea02014-04-04 19:37:40 +02008764 PyObject *item;
8765
8766 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008768
8769 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008771 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008776
8777 if (item == Py_None) {
8778 Py_DECREF(item);
8779 return 0;
8780 }
8781
8782 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008783 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8784 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8785 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008786 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8787 Py_DECREF(item);
8788 return -1;
8789 }
8790 Py_DECREF(item);
8791 return 1;
8792 }
8793
8794 if (!PyUnicode_Check(item)) {
8795 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008797 }
8798
8799 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8800 Py_DECREF(item);
8801 return -1;
8802 }
8803
8804 Py_DECREF(item);
8805 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008806}
8807
Victor Stinner89a76ab2014-04-05 11:44:04 +02008808static int
8809unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8810 Py_UCS1 *translate)
8811{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008812 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813 int ret = 0;
8814
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815 if (charmaptranslate_lookup(ch, mapping, &item)) {
8816 return -1;
8817 }
8818
8819 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008820 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008821 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008822 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008823 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008824 /* not found => default to 1:1 mapping */
8825 translate[ch] = ch;
8826 return 1;
8827 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008828 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008829 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008830 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8831 used it */
8832 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008833 /* invalid character or character outside ASCII:
8834 skip the fast translate */
8835 goto exit;
8836 }
8837 translate[ch] = (Py_UCS1)replace;
8838 }
8839 else if (PyUnicode_Check(item)) {
8840 Py_UCS4 replace;
8841
8842 if (PyUnicode_READY(item) == -1) {
8843 Py_DECREF(item);
8844 return -1;
8845 }
8846 if (PyUnicode_GET_LENGTH(item) != 1)
8847 goto exit;
8848
8849 replace = PyUnicode_READ_CHAR(item, 0);
8850 if (replace > 127)
8851 goto exit;
8852 translate[ch] = (Py_UCS1)replace;
8853 }
8854 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008855 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 goto exit;
8857 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 ret = 1;
8859
Benjamin Peterson1365de72014-04-07 20:15:41 -04008860 exit:
8861 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 return ret;
8863}
8864
8865/* Fast path for ascii => ascii translation. Return 1 if the whole string
8866 was translated into writer, return 0 if the input string was partially
8867 translated into writer, raise an exception and return -1 on error. */
8868static int
8869unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008870 _PyUnicodeWriter *writer, int ignore,
8871 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872{
Victor Stinner872b2912014-04-05 14:27:07 +02008873 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874 Py_ssize_t len;
8875 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008876 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878 len = PyUnicode_GET_LENGTH(input);
8879
Victor Stinner872b2912014-04-05 14:27:07 +02008880 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008881
8882 in = PyUnicode_1BYTE_DATA(input);
8883 end = in + len;
8884
8885 assert(PyUnicode_IS_ASCII(writer->buffer));
8886 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8887 out = PyUnicode_1BYTE_DATA(writer->buffer);
8888
Victor Stinner872b2912014-04-05 14:27:07 +02008889 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008890 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008891 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008893 int translate = unicode_fast_translate_lookup(mapping, ch,
8894 ascii_table);
8895 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008896 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008897 if (translate == 0)
8898 goto exit;
8899 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900 }
Victor Stinner872b2912014-04-05 14:27:07 +02008901 if (ch2 == 0xfe) {
8902 if (ignore)
8903 continue;
8904 goto exit;
8905 }
8906 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008907 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008908 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008909 }
Victor Stinner872b2912014-04-05 14:27:07 +02008910 res = 1;
8911
8912exit:
8913 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008914 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008915 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008916}
8917
Victor Stinner3222da22015-10-01 22:07:32 +02008918static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919_PyUnicode_TranslateCharmap(PyObject *input,
8920 PyObject *mapping,
8921 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008924 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 Py_ssize_t size, i;
8926 int kind;
8927 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008928 _PyUnicodeWriter writer;
8929 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008930 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008931 PyObject *errorHandler = NULL;
8932 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008935
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 PyErr_BadArgument();
8938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 if (PyUnicode_READY(input) == -1)
8942 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008943 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 kind = PyUnicode_KIND(input);
8945 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008947 if (size == 0)
8948 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008950 /* allocate enough for a simple 1:1 translation without
8951 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008952 _PyUnicodeWriter_Init(&writer);
8953 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955
Victor Stinner872b2912014-04-05 14:27:07 +02008956 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8957
Victor Stinner33798672016-03-01 21:59:58 +01008958 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008959 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008960 if (PyUnicode_IS_ASCII(input)) {
8961 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8962 if (res < 0) {
8963 _PyUnicodeWriter_Dealloc(&writer);
8964 return NULL;
8965 }
8966 if (res == 1)
8967 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008968 }
Victor Stinner33798672016-03-01 21:59:58 +01008969 else {
8970 i = 0;
8971 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008975 int translate;
8976 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8977 Py_ssize_t newpos;
8978 /* startpos for collecting untranslatable chars */
8979 Py_ssize_t collstart;
8980 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982
Victor Stinner1194ea02014-04-04 19:37:40 +02008983 ch = PyUnicode_READ(kind, data, i);
8984 translate = charmaptranslate_output(ch, mapping, &writer);
8985 if (translate < 0)
8986 goto onError;
8987
8988 if (translate != 0) {
8989 /* it worked => adjust input pointer */
8990 ++i;
8991 continue;
8992 }
8993
8994 /* untranslatable character */
8995 collstart = i;
8996 collend = i+1;
8997
8998 /* find all untranslatable characters */
8999 while (collend < size) {
9000 PyObject *x;
9001 ch = PyUnicode_READ(kind, data, collend);
9002 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009003 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009004 Py_XDECREF(x);
9005 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009007 ++collend;
9008 }
9009
9010 if (ignore) {
9011 i = collend;
9012 }
9013 else {
9014 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9015 reason, input, &exc,
9016 collstart, collend, &newpos);
9017 if (repunicode == NULL)
9018 goto onError;
9019 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009021 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009022 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009023 Py_DECREF(repunicode);
9024 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009025 }
9026 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009027 Py_XDECREF(exc);
9028 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009032 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033 Py_XDECREF(exc);
9034 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035 return NULL;
9036}
9037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038/* Deprecated. Use PyUnicode_Translate instead. */
9039PyObject *
9040PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9041 Py_ssize_t size,
9042 PyObject *mapping,
9043 const char *errors)
9044{
Christian Heimes5f520f42012-09-11 14:03:25 +02009045 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009046 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 if (!unicode)
9048 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009049 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9050 Py_DECREF(unicode);
9051 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052}
9053
Alexander Belopolsky40018472011-02-26 01:02:56 +00009054PyObject *
9055PyUnicode_Translate(PyObject *str,
9056 PyObject *mapping,
9057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009059 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009060 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009061 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062}
Tim Petersced69f82003-09-16 20:30:58 +00009063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064PyObject *
9065_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9066{
9067 if (!PyUnicode_Check(unicode)) {
9068 PyErr_BadInternalCall();
9069 return NULL;
9070 }
9071 if (PyUnicode_READY(unicode) == -1)
9072 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009073 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 /* If the string is already ASCII, just return the same string */
9075 Py_INCREF(unicode);
9076 return unicode;
9077 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009078
9079 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9080 PyObject *result = PyUnicode_New(len, 127);
9081 if (result == NULL) {
9082 return NULL;
9083 }
9084
9085 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9086 int kind = PyUnicode_KIND(unicode);
9087 const void *data = PyUnicode_DATA(unicode);
9088 Py_ssize_t i;
9089 for (i = 0; i < len; ++i) {
9090 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9091 if (ch < 127) {
9092 out[i] = ch;
9093 }
9094 else if (Py_UNICODE_ISSPACE(ch)) {
9095 out[i] = ' ';
9096 }
9097 else {
9098 int decimal = Py_UNICODE_TODECIMAL(ch);
9099 if (decimal < 0) {
9100 out[i] = '?';
Miss Islington (bot)c7214722018-07-13 20:58:12 -07009101 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009102 _PyUnicode_LENGTH(result) = i + 1;
9103 break;
9104 }
9105 out[i] = '0' + decimal;
9106 }
9107 }
9108
Miss Islington (bot)c7214722018-07-13 20:58:12 -07009109 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009110 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111}
9112
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009113PyObject *
9114PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9115 Py_ssize_t length)
9116{
Victor Stinnerf0124502011-11-21 23:12:56 +01009117 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009118 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009119 Py_UCS4 maxchar;
9120 enum PyUnicode_Kind kind;
9121 void *data;
9122
Victor Stinner99d7ad02012-02-22 13:37:39 +01009123 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009124 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009125 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009126 if (ch > 127) {
9127 int decimal = Py_UNICODE_TODECIMAL(ch);
9128 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009129 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009130 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009131 }
9132 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009133
9134 /* Copy to a new string */
9135 decimal = PyUnicode_New(length, maxchar);
9136 if (decimal == NULL)
9137 return decimal;
9138 kind = PyUnicode_KIND(decimal);
9139 data = PyUnicode_DATA(decimal);
9140 /* Iterate over code points */
9141 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009142 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009143 if (ch > 127) {
9144 int decimal = Py_UNICODE_TODECIMAL(ch);
9145 if (decimal >= 0)
9146 ch = '0' + decimal;
9147 }
9148 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009150 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009151}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009152/* --- Decimal Encoder ---------------------------------------------------- */
9153
Alexander Belopolsky40018472011-02-26 01:02:56 +00009154int
9155PyUnicode_EncodeDecimal(Py_UNICODE *s,
9156 Py_ssize_t length,
9157 char *output,
9158 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009159{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009160 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009161 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009162 enum PyUnicode_Kind kind;
9163 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009164
9165 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 PyErr_BadArgument();
9167 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009168 }
9169
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009170 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009171 if (unicode == NULL)
9172 return -1;
9173
Victor Stinner42bf7752011-11-21 22:52:58 +01009174 kind = PyUnicode_KIND(unicode);
9175 data = PyUnicode_DATA(unicode);
9176
Victor Stinnerb84d7232011-11-22 01:50:07 +01009177 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009178 PyObject *exc;
9179 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009181 Py_ssize_t startpos;
9182
9183 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009184
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009186 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009187 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009189 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 decimal = Py_UNICODE_TODECIMAL(ch);
9191 if (decimal >= 0) {
9192 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009193 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 continue;
9195 }
9196 if (0 < ch && ch < 256) {
9197 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009198 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 continue;
9200 }
Victor Stinner6345be92011-11-25 20:09:01 +01009201
Victor Stinner42bf7752011-11-21 22:52:58 +01009202 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009203 exc = NULL;
9204 raise_encode_exception(&exc, "decimal", unicode,
9205 startpos, startpos+1,
9206 "invalid decimal Unicode string");
9207 Py_XDECREF(exc);
9208 Py_DECREF(unicode);
9209 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009210 }
9211 /* 0-terminate the output string */
9212 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009213 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009214 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009215}
9216
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217/* --- Helpers ------------------------------------------------------------ */
9218
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, and maps negative `start`/`end` to offsets from
   the end of the sequence (never below 0).  `start` and `end` must be
   modifiable lvalues; `len` is evaluated more than once, so it must be free
   of side effects.

   Wrapped in do { ... } while (0) so that the expansion is a single
   statement and stays correct as the body of an unbraced if/else.
   Invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if (end > (len)) {                      \
            end = (len);                        \
        }                                       \
        else if (end < 0) {                     \
            end += (len);                       \
            if (end < 0) {                      \
                end = 0;                        \
            }                                   \
        }                                       \
        if (start < 0) {                        \
            start += (len);                     \
            if (start < 0) {                    \
                start = 0;                      \
            }                                   \
        }                                       \
    } while (0)
9233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009235any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009237 Py_ssize_t end,
9238 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009240 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 void *buf1, *buf2;
9242 Py_ssize_t len1, len2, result;
9243
9244 kind1 = PyUnicode_KIND(s1);
9245 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009246 if (kind1 < kind2)
9247 return -1;
9248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 len1 = PyUnicode_GET_LENGTH(s1);
9250 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009251 ADJUST_INDICES(start, end, len1);
9252 if (end - start < len2)
9253 return -1;
9254
9255 buf1 = PyUnicode_DATA(s1);
9256 buf2 = PyUnicode_DATA(s2);
9257 if (len2 == 1) {
9258 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9259 result = findchar((const char *)buf1 + kind1*start,
9260 kind1, end - start, ch, direction);
9261 if (result == -1)
9262 return -1;
9263 else
9264 return start + result;
9265 }
9266
9267 if (kind2 != kind1) {
9268 buf2 = _PyUnicode_AsKind(s2, kind1);
9269 if (!buf2)
9270 return -2;
9271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272
Victor Stinner794d5672011-10-10 03:21:36 +02009273 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009274 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009275 case PyUnicode_1BYTE_KIND:
9276 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9277 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9278 else
9279 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9280 break;
9281 case PyUnicode_2BYTE_KIND:
9282 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9283 break;
9284 case PyUnicode_4BYTE_KIND:
9285 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9286 break;
9287 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009288 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009289 }
9290 }
9291 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009292 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009293 case PyUnicode_1BYTE_KIND:
9294 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9295 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9296 else
9297 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9298 break;
9299 case PyUnicode_2BYTE_KIND:
9300 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 break;
9302 case PyUnicode_4BYTE_KIND:
9303 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9304 break;
9305 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009306 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 }
9309
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009310 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 PyMem_Free(buf2);
9312
9313 return result;
9314}
9315
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009316/* _PyUnicode_InsertThousandsGrouping() helper functions */
9317#include "stringlib/localeutil.h"
9318
9319/**
9320 * InsertThousandsGrouping:
9321 * @writer: Unicode writer.
9322 * @n_buffer: Number of characters in @buffer.
9323 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9324 * @d_pos: Start of digits string.
9325 * @n_digits: The number of digits in the string, in which we want
9326 * to put the grouping chars.
9327 * @min_width: The minimum width of the digits in the output string.
9328 * Output will be zero-padded on the left to fill.
9329 * @grouping: see definition in localeconv().
9330 * @thousands_sep: see definition in localeconv().
9331 *
9332 * There are 2 modes: counting and filling. If @writer is NULL,
9333 * we are in counting mode, else filling mode.
9334 * If counting, the required buffer size is returned.
9335 * If filling, we know the buffer will be large enough, so we don't
9336 * need to pass in the buffer size.
9337 * Inserts thousand grouping characters (as defined by grouping and
9338 * thousands_sep) into @writer.
9339 *
9340 * Return value: -1 on error, number of characters otherwise.
9341 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009343_PyUnicode_InsertThousandsGrouping(
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009344 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009345 Py_ssize_t n_buffer,
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009346 PyObject *digits,
9347 Py_ssize_t d_pos,
9348 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009349 Py_ssize_t min_width,
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009350 const char *grouping,
9351 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009352 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353{
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009354 if (writer) {
9355 assert(digits != NULL);
9356 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 }
9358 else {
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009359 assert(digits == NULL);
9360 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009361 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009362 assert(0 <= d_pos);
9363 assert(0 <= n_digits);
9364 assert(0 <= min_width);
9365 assert(grouping != NULL);
9366
9367 if (digits != NULL) {
9368 if (PyUnicode_READY(digits) == -1) {
9369 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009370 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009371 }
9372 if (PyUnicode_READY(thousands_sep) == -1) {
9373 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 }
9375
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009376 Py_ssize_t count = 0;
9377 Py_ssize_t n_zeros;
9378 int loop_broken = 0;
9379 int use_separator = 0; /* First time through, don't append the
9380 separator. They only go between
9381 groups. */
9382 Py_ssize_t buffer_pos;
9383 Py_ssize_t digits_pos;
9384 Py_ssize_t len;
9385 Py_ssize_t n_chars;
9386 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9387 be looked at */
9388 /* A generator that returns all of the grouping widths, until it
9389 returns 0. */
9390 GroupGenerator groupgen;
9391 GroupGenerator_init(&groupgen, grouping);
9392 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9393
9394 /* if digits are not grouped, thousands separator
9395 should be an empty string */
9396 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9397
9398 digits_pos = d_pos + n_digits;
9399 if (writer) {
9400 buffer_pos = writer->pos + n_buffer;
9401 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9402 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009404 else {
9405 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009406 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009407
9408 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009409 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009410 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009411
9412 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9413 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9414 n_zeros = Py_MAX(0, len - remaining);
9415 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9416
9417 /* Use n_zero zero's and n_chars chars */
9418
9419 /* Count only, don't do anything. */
9420 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9421
9422 /* Copy into the writer. */
9423 InsertThousandsGrouping_fill(writer, &buffer_pos,
9424 digits, &digits_pos,
9425 n_chars, n_zeros,
9426 use_separator ? thousands_sep : NULL,
9427 thousands_sep_len, maxchar);
9428
9429 /* Use a separator next time. */
9430 use_separator = 1;
9431
9432 remaining -= n_chars;
9433 min_width -= len;
9434
9435 if (remaining <= 0 && min_width <= 0) {
9436 loop_broken = 1;
9437 break;
9438 }
9439 min_width -= thousands_sep_len;
9440 }
9441 if (!loop_broken) {
9442 /* We left the loop without using a break statement. */
9443
9444 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9445 n_zeros = Py_MAX(0, len - remaining);
9446 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9447
9448 /* Use n_zero zero's and n_chars chars */
9449 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9450
9451 /* Copy into the writer. */
9452 InsertThousandsGrouping_fill(writer, &buffer_pos,
9453 digits, &digits_pos,
9454 n_chars, n_zeros,
9455 use_separator ? thousands_sep : NULL,
9456 thousands_sep_len, maxchar);
9457 }
9458 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459}
9460
9461
Alexander Belopolsky40018472011-02-26 01:02:56 +00009462Py_ssize_t
9463PyUnicode_Count(PyObject *str,
9464 PyObject *substr,
9465 Py_ssize_t start,
9466 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009468 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009469 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 void *buf1 = NULL, *buf2 = NULL;
9471 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009472
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009473 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009474 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009475
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009476 kind1 = PyUnicode_KIND(str);
9477 kind2 = PyUnicode_KIND(substr);
9478 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009479 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009480
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009481 len1 = PyUnicode_GET_LENGTH(str);
9482 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009484 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009485 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009486
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009487 buf1 = PyUnicode_DATA(str);
9488 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009489 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009490 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009491 if (!buf2)
9492 goto onError;
9493 }
9494
9495 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009497 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009498 result = asciilib_count(
9499 ((Py_UCS1*)buf1) + start, end - start,
9500 buf2, len2, PY_SSIZE_T_MAX
9501 );
9502 else
9503 result = ucs1lib_count(
9504 ((Py_UCS1*)buf1) + start, end - start,
9505 buf2, len2, PY_SSIZE_T_MAX
9506 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 break;
9508 case PyUnicode_2BYTE_KIND:
9509 result = ucs2lib_count(
9510 ((Py_UCS2*)buf1) + start, end - start,
9511 buf2, len2, PY_SSIZE_T_MAX
9512 );
9513 break;
9514 case PyUnicode_4BYTE_KIND:
9515 result = ucs4lib_count(
9516 ((Py_UCS4*)buf1) + start, end - start,
9517 buf2, len2, PY_SSIZE_T_MAX
9518 );
9519 break;
9520 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009521 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009523
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009524 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 PyMem_Free(buf2);
9526
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009529 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 PyMem_Free(buf2);
9531 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532}
9533
Alexander Belopolsky40018472011-02-26 01:02:56 +00009534Py_ssize_t
9535PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009536 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009537 Py_ssize_t start,
9538 Py_ssize_t end,
9539 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009541 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009543
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009544 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545}
9546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547Py_ssize_t
9548PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9549 Py_ssize_t start, Py_ssize_t end,
9550 int direction)
9551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009553 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 if (PyUnicode_READY(str) == -1)
9555 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009556 len = PyUnicode_GET_LENGTH(str);
9557 ADJUST_INDICES(start, end, len);
9558 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009559 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009561 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9562 kind, end-start, ch, direction);
9563 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009565 else
9566 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567}
9568
Alexander Belopolsky40018472011-02-26 01:02:56 +00009569static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009570tailmatch(PyObject *self,
9571 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009572 Py_ssize_t start,
9573 Py_ssize_t end,
9574 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 int kind_self;
9577 int kind_sub;
9578 void *data_self;
9579 void *data_sub;
9580 Py_ssize_t offset;
9581 Py_ssize_t i;
9582 Py_ssize_t end_sub;
9583
9584 if (PyUnicode_READY(self) == -1 ||
9585 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009586 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9589 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009593 if (PyUnicode_GET_LENGTH(substring) == 0)
9594 return 1;
9595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 kind_self = PyUnicode_KIND(self);
9597 data_self = PyUnicode_DATA(self);
9598 kind_sub = PyUnicode_KIND(substring);
9599 data_sub = PyUnicode_DATA(substring);
9600 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9601
9602 if (direction > 0)
9603 offset = end;
9604 else
9605 offset = start;
9606
9607 if (PyUnicode_READ(kind_self, data_self, offset) ==
9608 PyUnicode_READ(kind_sub, data_sub, 0) &&
9609 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9610 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9611 /* If both are of the same kind, memcmp is sufficient */
9612 if (kind_self == kind_sub) {
9613 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009614 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 data_sub,
9616 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009617 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009619 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 else {
9621 /* We do not need to compare 0 and len(substring)-1 because
9622 the if statement above ensured already that they are equal
9623 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 for (i = 1; i < end_sub; ++i) {
9625 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9626 PyUnicode_READ(kind_sub, data_sub, i))
9627 return 0;
9628 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009629 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631 }
9632
9633 return 0;
9634}
9635
Alexander Belopolsky40018472011-02-26 01:02:56 +00009636Py_ssize_t
9637PyUnicode_Tailmatch(PyObject *str,
9638 PyObject *substr,
9639 Py_ssize_t start,
9640 Py_ssize_t end,
9641 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009643 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009644 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009645
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009646 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647}
9648
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009649static PyObject *
9650ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9653 char *resdata, *data = PyUnicode_DATA(self);
9654 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009655
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656 res = PyUnicode_New(len, 127);
9657 if (res == NULL)
9658 return NULL;
9659 resdata = PyUnicode_DATA(res);
9660 if (lower)
9661 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 _Py_bytes_upper(resdata, data, len);
9664 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665}
9666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009670 Py_ssize_t j;
9671 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009672 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009674
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9676
9677 where ! is a negation and \p{xxx} is a character with property xxx.
9678 */
9679 for (j = i - 1; j >= 0; j--) {
9680 c = PyUnicode_READ(kind, data, j);
9681 if (!_PyUnicode_IsCaseIgnorable(c))
9682 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9685 if (final_sigma) {
9686 for (j = i + 1; j < length; j++) {
9687 c = PyUnicode_READ(kind, data, j);
9688 if (!_PyUnicode_IsCaseIgnorable(c))
9689 break;
9690 }
9691 final_sigma = j == length || !_PyUnicode_IsCased(c);
9692 }
9693 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694}
9695
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696static int
9697lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9698 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 /* Obscure special case. */
9701 if (c == 0x3A3) {
9702 mapped[0] = handle_capital_sigma(kind, data, length, i);
9703 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706}
9707
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009708static Py_ssize_t
9709do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 Py_ssize_t i, k = 0;
9712 int n_res, j;
9713 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009714
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715 c = PyUnicode_READ(kind, data, 0);
9716 n_res = _PyUnicode_ToUpperFull(c, mapped);
9717 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009718 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 for (i = 1; i < length; i++) {
9722 c = PyUnicode_READ(kind, data, i);
9723 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9724 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009725 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009727 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009728 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009729 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730}
9731
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732static Py_ssize_t
9733do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9734 Py_ssize_t i, k = 0;
9735
9736 for (i = 0; i < length; i++) {
9737 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9738 int n_res, j;
9739 if (Py_UNICODE_ISUPPER(c)) {
9740 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9741 }
9742 else if (Py_UNICODE_ISLOWER(c)) {
9743 n_res = _PyUnicode_ToUpperFull(c, mapped);
9744 }
9745 else {
9746 n_res = 1;
9747 mapped[0] = c;
9748 }
9749 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009750 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009751 res[k++] = mapped[j];
9752 }
9753 }
9754 return k;
9755}
9756
9757static Py_ssize_t
9758do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9759 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009761 Py_ssize_t i, k = 0;
9762
9763 for (i = 0; i < length; i++) {
9764 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9765 int n_res, j;
9766 if (lower)
9767 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9768 else
9769 n_res = _PyUnicode_ToUpperFull(c, mapped);
9770 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009771 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009772 res[k++] = mapped[j];
9773 }
9774 }
9775 return k;
9776}
9777
9778static Py_ssize_t
9779do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9780{
9781 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9782}
9783
9784static Py_ssize_t
9785do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9786{
9787 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9788}
9789
Benjamin Petersone51757f2012-01-12 21:10:29 -05009790static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009791do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792{
9793 Py_ssize_t i, k = 0;
9794
9795 for (i = 0; i < length; i++) {
9796 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9797 Py_UCS4 mapped[3];
9798 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9799 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009800 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009801 res[k++] = mapped[j];
9802 }
9803 }
9804 return k;
9805}
9806
9807static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009808do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9809{
9810 Py_ssize_t i, k = 0;
9811 int previous_is_cased;
9812
9813 previous_is_cased = 0;
9814 for (i = 0; i < length; i++) {
9815 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9816 Py_UCS4 mapped[3];
9817 int n_res, j;
9818
9819 if (previous_is_cased)
9820 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9821 else
9822 n_res = _PyUnicode_ToTitleFull(c, mapped);
9823
9824 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009825 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009826 res[k++] = mapped[j];
9827 }
9828
9829 previous_is_cased = _PyUnicode_IsCased(c);
9830 }
9831 return k;
9832}
9833
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009834static PyObject *
9835case_operation(PyObject *self,
9836 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9837{
9838 PyObject *res = NULL;
9839 Py_ssize_t length, newlength = 0;
9840 int kind, outkind;
9841 void *data, *outdata;
9842 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9843
Benjamin Petersoneea48462012-01-16 14:28:50 -05009844 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009845
9846 kind = PyUnicode_KIND(self);
9847 data = PyUnicode_DATA(self);
9848 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009849 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009850 PyErr_SetString(PyExc_OverflowError, "string is too long");
9851 return NULL;
9852 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009853 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009854 if (tmp == NULL)
9855 return PyErr_NoMemory();
9856 newlength = perform(kind, data, length, tmp, &maxchar);
9857 res = PyUnicode_New(newlength, maxchar);
9858 if (res == NULL)
9859 goto leave;
9860 tmpend = tmp + newlength;
9861 outdata = PyUnicode_DATA(res);
9862 outkind = PyUnicode_KIND(res);
9863 switch (outkind) {
9864 case PyUnicode_1BYTE_KIND:
9865 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9866 break;
9867 case PyUnicode_2BYTE_KIND:
9868 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9869 break;
9870 case PyUnicode_4BYTE_KIND:
9871 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9872 break;
9873 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009874 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875 }
9876 leave:
9877 PyMem_FREE(tmp);
9878 return res;
9879}
9880
Tim Peters8ce9f162004-08-27 01:49:32 +00009881PyObject *
9882PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009884 PyObject *res;
9885 PyObject *fseq;
9886 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009887 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009889 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009890 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009891 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009892 }
9893
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009894 /* NOTE: the following code can't call back into Python code,
9895 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009896 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009897
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009898 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009899 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009900 res = _PyUnicode_JoinArray(separator, items, seqlen);
9901 Py_DECREF(fseq);
9902 return res;
9903}
9904
9905PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009906_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009907{
9908 PyObject *res = NULL; /* the result */
9909 PyObject *sep = NULL;
9910 Py_ssize_t seplen;
9911 PyObject *item;
9912 Py_ssize_t sz, i, res_offset;
9913 Py_UCS4 maxchar;
9914 Py_UCS4 item_maxchar;
9915 int use_memcpy;
9916 unsigned char *res_data = NULL, *sep_data = NULL;
9917 PyObject *last_obj;
9918 unsigned int kind = 0;
9919
Tim Peters05eba1f2004-08-27 21:32:02 +00009920 /* If empty sequence, return u"". */
9921 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009922 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009923 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009924
Tim Peters05eba1f2004-08-27 21:32:02 +00009925 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009926 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009927 if (seqlen == 1) {
9928 if (PyUnicode_CheckExact(items[0])) {
9929 res = items[0];
9930 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009931 return res;
9932 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009933 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009934 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009935 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009936 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009937 /* Set up sep and seplen */
9938 if (separator == NULL) {
9939 /* fall back to a blank space separator */
9940 sep = PyUnicode_FromOrdinal(' ');
9941 if (!sep)
9942 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009943 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009944 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009945 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009946 else {
9947 if (!PyUnicode_Check(separator)) {
9948 PyErr_Format(PyExc_TypeError,
9949 "separator: expected str instance,"
9950 " %.80s found",
9951 Py_TYPE(separator)->tp_name);
9952 goto onError;
9953 }
9954 if (PyUnicode_READY(separator))
9955 goto onError;
9956 sep = separator;
9957 seplen = PyUnicode_GET_LENGTH(separator);
9958 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9959 /* inc refcount to keep this code path symmetric with the
9960 above case of a blank separator */
9961 Py_INCREF(sep);
9962 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009963 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009964 }
9965
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966 /* There are at least two things to join, or else we have a subclass
9967 * of str in the sequence.
9968 * Do a pre-pass to figure out the total amount of space we'll
9969 * need (sz), and see whether all argument are strings.
9970 */
9971 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009972#ifdef Py_DEBUG
9973 use_memcpy = 0;
9974#else
9975 use_memcpy = 1;
9976#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009977 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009978 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009979 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009980 if (!PyUnicode_Check(item)) {
9981 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009982 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 " %.80s found",
9984 i, Py_TYPE(item)->tp_name);
9985 goto onError;
9986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 if (PyUnicode_READY(item) == -1)
9988 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009989 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009991 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009992 if (i != 0) {
9993 add_sz += seplen;
9994 }
9995 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009996 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009998 goto onError;
9999 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010000 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010001 if (use_memcpy && last_obj != NULL) {
10002 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10003 use_memcpy = 0;
10004 }
10005 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 }
Tim Petersced69f82003-09-16 20:30:58 +000010007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 if (res == NULL)
10010 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010011
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010012 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010013#ifdef Py_DEBUG
10014 use_memcpy = 0;
10015#else
10016 if (use_memcpy) {
10017 res_data = PyUnicode_1BYTE_DATA(res);
10018 kind = PyUnicode_KIND(res);
10019 if (seplen != 0)
10020 sep_data = PyUnicode_1BYTE_DATA(sep);
10021 }
10022#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010023 if (use_memcpy) {
10024 for (i = 0; i < seqlen; ++i) {
10025 Py_ssize_t itemlen;
10026 item = items[i];
10027
10028 /* Copy item, and maybe the separator. */
10029 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010030 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010032 kind * seplen);
10033 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010034 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010035
10036 itemlen = PyUnicode_GET_LENGTH(item);
10037 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010038 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010040 kind * itemlen);
10041 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010043 }
10044 assert(res_data == PyUnicode_1BYTE_DATA(res)
10045 + kind * PyUnicode_GET_LENGTH(res));
10046 }
10047 else {
10048 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10049 Py_ssize_t itemlen;
10050 item = items[i];
10051
10052 /* Copy item, and maybe the separator. */
10053 if (i && seplen != 0) {
10054 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10055 res_offset += seplen;
10056 }
10057
10058 itemlen = PyUnicode_GET_LENGTH(item);
10059 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010060 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010061 res_offset += itemlen;
10062 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010063 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010064 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010065 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010068 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010073 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074 return NULL;
10075}
10076
Victor Stinnerd3f08822012-05-29 12:57:52 +020010077void
10078_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10079 Py_UCS4 fill_char)
10080{
10081 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10082 const void *data = PyUnicode_DATA(unicode);
10083 assert(PyUnicode_IS_READY(unicode));
10084 assert(unicode_modifiable(unicode));
10085 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10086 assert(start >= 0);
10087 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10088 FILL(kind, data, fill_char, start, length);
10089}
10090
Victor Stinner3fe55312012-01-04 00:33:50 +010010091Py_ssize_t
10092PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10093 Py_UCS4 fill_char)
10094{
10095 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010096
10097 if (!PyUnicode_Check(unicode)) {
10098 PyErr_BadInternalCall();
10099 return -1;
10100 }
10101 if (PyUnicode_READY(unicode) == -1)
10102 return -1;
10103 if (unicode_check_modifiable(unicode))
10104 return -1;
10105
Victor Stinnerd3f08822012-05-29 12:57:52 +020010106 if (start < 0) {
10107 PyErr_SetString(PyExc_IndexError, "string index out of range");
10108 return -1;
10109 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010110 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10111 PyErr_SetString(PyExc_ValueError,
10112 "fill character is bigger than "
10113 "the string maximum character");
10114 return -1;
10115 }
10116
10117 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10118 length = Py_MIN(maxlen, length);
10119 if (length <= 0)
10120 return 0;
10121
Victor Stinnerd3f08822012-05-29 12:57:52 +020010122 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010123 return length;
10124}
10125
Victor Stinner9310abb2011-10-05 00:59:23 +020010126static PyObject *
10127pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010128 Py_ssize_t left,
10129 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 PyObject *u;
10133 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010134 int kind;
10135 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136
10137 if (left < 0)
10138 left = 0;
10139 if (right < 0)
10140 right = 0;
10141
Victor Stinnerc4b49542011-12-11 22:44:26 +010010142 if (left == 0 && right == 0)
10143 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10146 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010147 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10148 return NULL;
10149 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010151 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010153 if (!u)
10154 return NULL;
10155
10156 kind = PyUnicode_KIND(u);
10157 data = PyUnicode_DATA(u);
10158 if (left)
10159 FILL(kind, data, fill, 0, left);
10160 if (right)
10161 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010162 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010163 assert(_PyUnicode_CheckConsistency(u, 1));
10164 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165}
10166
Alexander Belopolsky40018472011-02-26 01:02:56 +000010167PyObject *
10168PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010172 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Benjamin Petersonead6b532011-12-20 17:23:42 -060010175 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010177 if (PyUnicode_IS_ASCII(string))
10178 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010179 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010180 PyUnicode_GET_LENGTH(string), keepends);
10181 else
10182 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010183 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010184 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 break;
10186 case PyUnicode_2BYTE_KIND:
10187 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 PyUnicode_GET_LENGTH(string), keepends);
10190 break;
10191 case PyUnicode_4BYTE_KIND:
10192 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010193 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 PyUnicode_GET_LENGTH(string), keepends);
10195 break;
10196 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010197 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200}
10201
Alexander Belopolsky40018472011-02-26 01:02:56 +000010202static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010203split(PyObject *self,
10204 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010205 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010207 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 void *buf1, *buf2;
10209 Py_ssize_t len1, len2;
10210 PyObject* out;
10211
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010213 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (PyUnicode_READY(self) == -1)
10216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010219 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010221 if (PyUnicode_IS_ASCII(self))
10222 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010223 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224 PyUnicode_GET_LENGTH(self), maxcount
10225 );
10226 else
10227 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010228 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010229 PyUnicode_GET_LENGTH(self), maxcount
10230 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 case PyUnicode_2BYTE_KIND:
10232 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010233 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 PyUnicode_GET_LENGTH(self), maxcount
10235 );
10236 case PyUnicode_4BYTE_KIND:
10237 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010238 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 PyUnicode_GET_LENGTH(self), maxcount
10240 );
10241 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010242 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 }
10244
10245 if (PyUnicode_READY(substring) == -1)
10246 return NULL;
10247
10248 kind1 = PyUnicode_KIND(self);
10249 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 len1 = PyUnicode_GET_LENGTH(self);
10251 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010252 if (kind1 < kind2 || len1 < len2) {
10253 out = PyList_New(1);
10254 if (out == NULL)
10255 return NULL;
10256 Py_INCREF(self);
10257 PyList_SET_ITEM(out, 0, self);
10258 return out;
10259 }
10260 buf1 = PyUnicode_DATA(self);
10261 buf2 = PyUnicode_DATA(substring);
10262 if (kind2 != kind1) {
10263 buf2 = _PyUnicode_AsKind(substring, kind1);
10264 if (!buf2)
10265 return NULL;
10266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010268 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010270 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10271 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 else
10274 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010275 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 break;
10277 case PyUnicode_2BYTE_KIND:
10278 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010279 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 break;
10281 case PyUnicode_4BYTE_KIND:
10282 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010283 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 break;
10285 default:
10286 out = NULL;
10287 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010288 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 PyMem_Free(buf2);
10290 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291}
10292
Alexander Belopolsky40018472011-02-26 01:02:56 +000010293static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010294rsplit(PyObject *self,
10295 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010296 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010297{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010298 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 void *buf1, *buf2;
10300 Py_ssize_t len1, len2;
10301 PyObject* out;
10302
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010303 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010304 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 if (PyUnicode_READY(self) == -1)
10307 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010310 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010312 if (PyUnicode_IS_ASCII(self))
10313 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010314 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010315 PyUnicode_GET_LENGTH(self), maxcount
10316 );
10317 else
10318 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010320 PyUnicode_GET_LENGTH(self), maxcount
10321 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 case PyUnicode_2BYTE_KIND:
10323 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010324 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 PyUnicode_GET_LENGTH(self), maxcount
10326 );
10327 case PyUnicode_4BYTE_KIND:
10328 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 PyUnicode_GET_LENGTH(self), maxcount
10331 );
10332 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010333 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 }
10335
10336 if (PyUnicode_READY(substring) == -1)
10337 return NULL;
10338
10339 kind1 = PyUnicode_KIND(self);
10340 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 len1 = PyUnicode_GET_LENGTH(self);
10342 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010343 if (kind1 < kind2 || len1 < len2) {
10344 out = PyList_New(1);
10345 if (out == NULL)
10346 return NULL;
10347 Py_INCREF(self);
10348 PyList_SET_ITEM(out, 0, self);
10349 return out;
10350 }
10351 buf1 = PyUnicode_DATA(self);
10352 buf2 = PyUnicode_DATA(substring);
10353 if (kind2 != kind1) {
10354 buf2 = _PyUnicode_AsKind(substring, kind1);
10355 if (!buf2)
10356 return NULL;
10357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010359 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10362 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010363 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010364 else
10365 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010366 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 break;
10368 case PyUnicode_2BYTE_KIND:
10369 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010370 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 break;
10372 case PyUnicode_4BYTE_KIND:
10373 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010374 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 break;
10376 default:
10377 out = NULL;
10378 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010379 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 PyMem_Free(buf2);
10381 return out;
10382}
10383
10384static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010385anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10386 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010388 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010390 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10391 return asciilib_find(buf1, len1, buf2, len2, offset);
10392 else
10393 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 case PyUnicode_2BYTE_KIND:
10395 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10396 case PyUnicode_4BYTE_KIND:
10397 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10398 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010399 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400}
10401
10402static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010403anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10404 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010406 switch (kind) {
10407 case PyUnicode_1BYTE_KIND:
10408 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10409 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10410 else
10411 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10412 case PyUnicode_2BYTE_KIND:
10413 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10414 case PyUnicode_4BYTE_KIND:
10415 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10416 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010417 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010418}
10419
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010420static void
10421replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10422 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10423{
10424 int kind = PyUnicode_KIND(u);
10425 void *data = PyUnicode_DATA(u);
10426 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10427 if (kind == PyUnicode_1BYTE_KIND) {
10428 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10429 (Py_UCS1 *)data + len,
10430 u1, u2, maxcount);
10431 }
10432 else if (kind == PyUnicode_2BYTE_KIND) {
10433 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10434 (Py_UCS2 *)data + len,
10435 u1, u2, maxcount);
10436 }
10437 else {
10438 assert(kind == PyUnicode_4BYTE_KIND);
10439 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10440 (Py_UCS4 *)data + len,
10441 u1, u2, maxcount);
10442 }
10443}
10444
Alexander Belopolsky40018472011-02-26 01:02:56 +000010445static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446replace(PyObject *self, PyObject *str1,
10447 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 PyObject *u;
10450 char *sbuf = PyUnicode_DATA(self);
10451 char *buf1 = PyUnicode_DATA(str1);
10452 char *buf2 = PyUnicode_DATA(str2);
10453 int srelease = 0, release1 = 0, release2 = 0;
10454 int skind = PyUnicode_KIND(self);
10455 int kind1 = PyUnicode_KIND(str1);
10456 int kind2 = PyUnicode_KIND(str2);
10457 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10458 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10459 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010460 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010461 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462
10463 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010464 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010466 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467
Victor Stinner59de0ee2011-10-07 10:01:28 +020010468 if (str1 == str2)
10469 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470
Victor Stinner49a0a212011-10-12 23:46:10 +020010471 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010472 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10473 if (maxchar < maxchar_str1)
10474 /* substring too wide to be present */
10475 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010476 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10477 /* Replacing str1 with str2 may cause a maxchar reduction in the
10478 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010479 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010480 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010483 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010485 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010487 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010488 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010489 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010490
Victor Stinner69ed0f42013-04-09 21:48:24 +020010491 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010492 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010493 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010494 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010495 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010499
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010500 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10501 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010502 }
10503 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 int rkind = skind;
10505 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010506 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 if (kind1 < rkind) {
10509 /* widen substring */
10510 buf1 = _PyUnicode_AsKind(str1, rkind);
10511 if (!buf1) goto error;
10512 release1 = 1;
10513 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010514 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010515 if (i < 0)
10516 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 if (rkind > kind2) {
10518 /* widen replacement */
10519 buf2 = _PyUnicode_AsKind(str2, rkind);
10520 if (!buf2) goto error;
10521 release2 = 1;
10522 }
10523 else if (rkind < kind2) {
10524 /* widen self and buf1 */
10525 rkind = kind2;
10526 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010527 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 sbuf = _PyUnicode_AsKind(self, rkind);
10529 if (!sbuf) goto error;
10530 srelease = 1;
10531 buf1 = _PyUnicode_AsKind(str1, rkind);
10532 if (!buf1) goto error;
10533 release1 = 1;
10534 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010535 u = PyUnicode_New(slen, maxchar);
10536 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010538 assert(PyUnicode_KIND(u) == rkind);
10539 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010540
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010541 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010542 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010543 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010545 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010547
10548 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010549 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010550 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010551 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010552 if (i == -1)
10553 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010554 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010556 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010560 }
10561 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010563 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 int rkind = skind;
10565 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010568 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 buf1 = _PyUnicode_AsKind(str1, rkind);
10570 if (!buf1) goto error;
10571 release1 = 1;
10572 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010573 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010574 if (n == 0)
10575 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010577 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 buf2 = _PyUnicode_AsKind(str2, rkind);
10579 if (!buf2) goto error;
10580 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010583 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 rkind = kind2;
10585 sbuf = _PyUnicode_AsKind(self, rkind);
10586 if (!sbuf) goto error;
10587 srelease = 1;
10588 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010589 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 buf1 = _PyUnicode_AsKind(str1, rkind);
10591 if (!buf1) goto error;
10592 release1 = 1;
10593 }
10594 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10595 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010596 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 PyErr_SetString(PyExc_OverflowError,
10598 "replace string is too long");
10599 goto error;
10600 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010601 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010602 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010603 _Py_INCREF_UNICODE_EMPTY();
10604 if (!unicode_empty)
10605 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 u = unicode_empty;
10607 goto done;
10608 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010609 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 PyErr_SetString(PyExc_OverflowError,
10611 "replace string is too long");
10612 goto error;
10613 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010614 u = PyUnicode_New(new_size, maxchar);
10615 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010617 assert(PyUnicode_KIND(u) == rkind);
10618 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 ires = i = 0;
10620 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 while (n-- > 0) {
10622 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010623 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010624 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010625 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010626 if (j == -1)
10627 break;
10628 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010630 memcpy(res + rkind * ires,
10631 sbuf + rkind * i,
10632 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 }
10635 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010637 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010639 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010646 memcpy(res + rkind * ires,
10647 sbuf + rkind * i,
10648 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010649 }
10650 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651 /* interleave */
10652 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 if (--n <= 0)
10658 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 memcpy(res + rkind * ires,
10660 sbuf + rkind * i,
10661 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 ires++;
10663 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010665 memcpy(res + rkind * ires,
10666 sbuf + rkind * i,
10667 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010669 }
10670
10671 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010672 unicode_adjust_maxchar(&u);
10673 if (u == NULL)
10674 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010676
10677 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (srelease)
10679 PyMem_FREE(sbuf);
10680 if (release1)
10681 PyMem_FREE(buf1);
10682 if (release2)
10683 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010684 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686
Benjamin Peterson29060642009-01-31 22:14:21 +000010687 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 if (srelease)
10690 PyMem_FREE(sbuf);
10691 if (release1)
10692 PyMem_FREE(buf1);
10693 if (release2)
10694 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010695 return unicode_result_unchanged(self);
10696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 error:
10698 if (srelease && sbuf)
10699 PyMem_FREE(sbuf);
10700 if (release1 && buf1)
10701 PyMem_FREE(buf1);
10702 if (release2 && buf2)
10703 PyMem_FREE(buf2);
10704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705}
10706
10707/* --- Unicode Object Methods --------------------------------------------- */
10708
INADA Naoki3ae20562017-01-16 20:41:20 +090010709/*[clinic input]
10710str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711
INADA Naoki3ae20562017-01-16 20:41:20 +090010712Return a version of the string where each word is titlecased.
10713
10714More specifically, words start with uppercased characters and all remaining
10715cased characters have lower case.
10716[clinic start generated code]*/
10717
10718static PyObject *
10719unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010720/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010722 if (PyUnicode_READY(self) == -1)
10723 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010724 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725}
10726
INADA Naoki3ae20562017-01-16 20:41:20 +090010727/*[clinic input]
10728str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
INADA Naoki3ae20562017-01-16 20:41:20 +090010730Return a capitalized version of the string.
10731
10732More specifically, make the first character have upper case and the rest lower
10733case.
10734[clinic start generated code]*/
10735
10736static PyObject *
10737unicode_capitalize_impl(PyObject *self)
10738/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010740 if (PyUnicode_READY(self) == -1)
10741 return NULL;
10742 if (PyUnicode_GET_LENGTH(self) == 0)
10743 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010744 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745}
10746
INADA Naoki3ae20562017-01-16 20:41:20 +090010747/*[clinic input]
10748str.casefold as unicode_casefold
10749
10750Return a version of the string suitable for caseless comparisons.
10751[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010752
10753static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010754unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010755/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010756{
10757 if (PyUnicode_READY(self) == -1)
10758 return NULL;
10759 if (PyUnicode_IS_ASCII(self))
10760 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010761 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010762}
10763
10764
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010765/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010766
10767static int
10768convert_uc(PyObject *obj, void *addr)
10769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010771
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010772 if (!PyUnicode_Check(obj)) {
10773 PyErr_Format(PyExc_TypeError,
10774 "The fill character must be a unicode character, "
10775 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010776 return 0;
10777 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010778 if (PyUnicode_READY(obj) < 0)
10779 return 0;
10780 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010781 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010782 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010783 return 0;
10784 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010785 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010786 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010787}
10788
INADA Naoki3ae20562017-01-16 20:41:20 +090010789/*[clinic input]
10790str.center as unicode_center
10791
10792 width: Py_ssize_t
10793 fillchar: Py_UCS4 = ' '
10794 /
10795
10796Return a centered string of length width.
10797
10798Padding is done using the specified fill character (default is a space).
10799[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800
10801static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010802unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10803/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010805 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806
Benjamin Petersonbac79492012-01-14 13:34:47 -050010807 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808 return NULL;
10809
Victor Stinnerc4b49542011-12-11 22:44:26 +010010810 if (PyUnicode_GET_LENGTH(self) >= width)
10811 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
Victor Stinnerc4b49542011-12-11 22:44:26 +010010813 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814 left = marg / 2 + (marg & width & 1);
10815
Victor Stinner9310abb2011-10-05 00:59:23 +020010816 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817}
10818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819/* This function assumes that str1 and str2 are readied by the caller. */
10820
Marc-André Lemburge5034372000-08-08 08:04:29 +000010821static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010822unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010823{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010824#define COMPARE(TYPE1, TYPE2) \
10825 do { \
10826 TYPE1* p1 = (TYPE1 *)data1; \
10827 TYPE2* p2 = (TYPE2 *)data2; \
10828 TYPE1* end = p1 + len; \
10829 Py_UCS4 c1, c2; \
10830 for (; p1 != end; p1++, p2++) { \
10831 c1 = *p1; \
10832 c2 = *p2; \
10833 if (c1 != c2) \
10834 return (c1 < c2) ? -1 : 1; \
10835 } \
10836 } \
10837 while (0)
10838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 int kind1, kind2;
10840 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010841 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 kind1 = PyUnicode_KIND(str1);
10844 kind2 = PyUnicode_KIND(str2);
10845 data1 = PyUnicode_DATA(str1);
10846 data2 = PyUnicode_DATA(str2);
10847 len1 = PyUnicode_GET_LENGTH(str1);
10848 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010849 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010850
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010851 switch(kind1) {
10852 case PyUnicode_1BYTE_KIND:
10853 {
10854 switch(kind2) {
10855 case PyUnicode_1BYTE_KIND:
10856 {
10857 int cmp = memcmp(data1, data2, len);
10858 /* normalize result of memcmp() into the range [-1; 1] */
10859 if (cmp < 0)
10860 return -1;
10861 if (cmp > 0)
10862 return 1;
10863 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010864 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010865 case PyUnicode_2BYTE_KIND:
10866 COMPARE(Py_UCS1, Py_UCS2);
10867 break;
10868 case PyUnicode_4BYTE_KIND:
10869 COMPARE(Py_UCS1, Py_UCS4);
10870 break;
10871 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010872 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010873 }
10874 break;
10875 }
10876 case PyUnicode_2BYTE_KIND:
10877 {
10878 switch(kind2) {
10879 case PyUnicode_1BYTE_KIND:
10880 COMPARE(Py_UCS2, Py_UCS1);
10881 break;
10882 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010883 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010884 COMPARE(Py_UCS2, Py_UCS2);
10885 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010886 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 case PyUnicode_4BYTE_KIND:
10888 COMPARE(Py_UCS2, Py_UCS4);
10889 break;
10890 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010891 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010892 }
10893 break;
10894 }
10895 case PyUnicode_4BYTE_KIND:
10896 {
10897 switch(kind2) {
10898 case PyUnicode_1BYTE_KIND:
10899 COMPARE(Py_UCS4, Py_UCS1);
10900 break;
10901 case PyUnicode_2BYTE_KIND:
10902 COMPARE(Py_UCS4, Py_UCS2);
10903 break;
10904 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010905 {
10906#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10907 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10908 /* normalize result of wmemcmp() into the range [-1; 1] */
10909 if (cmp < 0)
10910 return -1;
10911 if (cmp > 0)
10912 return 1;
10913#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010914 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010915#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010916 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010917 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010918 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010919 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010920 }
10921 break;
10922 }
10923 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010924 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010925 }
10926
Victor Stinner770e19e2012-10-04 22:59:45 +020010927 if (len1 == len2)
10928 return 0;
10929 if (len1 < len2)
10930 return -1;
10931 else
10932 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010933
10934#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010935}
10936
Benjamin Peterson621b4302016-09-09 13:54:34 -070010937static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010938unicode_compare_eq(PyObject *str1, PyObject *str2)
10939{
10940 int kind;
10941 void *data1, *data2;
10942 Py_ssize_t len;
10943 int cmp;
10944
Victor Stinnere5567ad2012-10-23 02:48:49 +020010945 len = PyUnicode_GET_LENGTH(str1);
10946 if (PyUnicode_GET_LENGTH(str2) != len)
10947 return 0;
10948 kind = PyUnicode_KIND(str1);
10949 if (PyUnicode_KIND(str2) != kind)
10950 return 0;
10951 data1 = PyUnicode_DATA(str1);
10952 data2 = PyUnicode_DATA(str2);
10953
10954 cmp = memcmp(data1, data2, len * kind);
10955 return (cmp == 0);
10956}
10957
10958
Alexander Belopolsky40018472011-02-26 01:02:56 +000010959int
10960PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10963 if (PyUnicode_READY(left) == -1 ||
10964 PyUnicode_READY(right) == -1)
10965 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010966
10967 /* a string is equal to itself */
10968 if (left == right)
10969 return 0;
10970
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010971 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010973 PyErr_Format(PyExc_TypeError,
10974 "Can't compare %.100s and %.100s",
10975 left->ob_type->tp_name,
10976 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 return -1;
10978}
10979
Martin v. Löwis5b222132007-06-10 09:51:05 +000010980int
10981PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 Py_ssize_t i;
10984 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010986 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987
Victor Stinner910337b2011-10-03 03:20:16 +020010988 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010989 if (!PyUnicode_IS_READY(uni)) {
10990 const wchar_t *ws = _PyUnicode_WSTR(uni);
10991 /* Compare Unicode string and source character set string */
10992 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10993 if (chr != ustr[i])
10994 return (chr < ustr[i]) ? -1 : 1;
10995 }
10996 /* This check keeps Python strings that end in '\0' from comparing equal
10997 to C strings identical up to that point. */
10998 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10999 return 1; /* uni is longer */
11000 if (ustr[i])
11001 return -1; /* str is longer */
11002 return 0;
11003 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011005 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011006 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011007 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011008 size_t len, len2 = strlen(str);
11009 int cmp;
11010
11011 len = Py_MIN(len1, len2);
11012 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011013 if (cmp != 0) {
11014 if (cmp < 0)
11015 return -1;
11016 else
11017 return 1;
11018 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011019 if (len1 > len2)
11020 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011021 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011022 return -1; /* str is longer */
11023 return 0;
11024 }
11025 else {
11026 void *data = PyUnicode_DATA(uni);
11027 /* Compare Unicode string and source character set string */
11028 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011029 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011030 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11031 /* This check keeps Python strings that end in '\0' from comparing equal
11032 to C strings identical up to that point. */
11033 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11034 return 1; /* uni is longer */
11035 if (str[i])
11036 return -1; /* str is longer */
11037 return 0;
11038 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011039}
11040
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011041static int
11042non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11043{
11044 size_t i, len;
11045 const wchar_t *p;
11046 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11047 if (strlen(str) != len)
11048 return 0;
11049 p = _PyUnicode_WSTR(unicode);
11050 assert(p);
11051 for (i = 0; i < len; i++) {
11052 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011053 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011054 return 0;
11055 }
11056 return 1;
11057}
11058
11059int
11060_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11061{
11062 size_t len;
11063 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011064 assert(str);
11065#ifndef NDEBUG
11066 for (const char *p = str; *p; p++) {
11067 assert((unsigned char)*p < 128);
11068 }
11069#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011070 if (PyUnicode_READY(unicode) == -1) {
11071 /* Memory error or bad data */
11072 PyErr_Clear();
11073 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11074 }
11075 if (!PyUnicode_IS_ASCII(unicode))
11076 return 0;
11077 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11078 return strlen(str) == len &&
11079 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11080}
11081
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011082int
11083_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11084{
11085 PyObject *right_uni;
11086 Py_hash_t hash;
11087
11088 assert(_PyUnicode_CHECK(left));
11089 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011090#ifndef NDEBUG
11091 for (const char *p = right->string; *p; p++) {
11092 assert((unsigned char)*p < 128);
11093 }
11094#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011095
11096 if (PyUnicode_READY(left) == -1) {
11097 /* memory error or bad data */
11098 PyErr_Clear();
11099 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11100 }
11101
11102 if (!PyUnicode_IS_ASCII(left))
11103 return 0;
11104
11105 right_uni = _PyUnicode_FromId(right); /* borrowed */
11106 if (right_uni == NULL) {
11107 /* memory error or bad data */
11108 PyErr_Clear();
11109 return _PyUnicode_EqualToASCIIString(left, right->string);
11110 }
11111
11112 if (left == right_uni)
11113 return 1;
11114
11115 if (PyUnicode_CHECK_INTERNED(left))
11116 return 0;
11117
INADA Naoki7cc95f52018-01-28 02:07:09 +090011118 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011119 hash = _PyUnicode_HASH(left);
11120 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11121 return 0;
11122
11123 return unicode_compare_eq(left, right_uni);
11124}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011125
Alexander Belopolsky40018472011-02-26 01:02:56 +000011126PyObject *
11127PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011128{
11129 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011130
Victor Stinnere5567ad2012-10-23 02:48:49 +020011131 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11132 Py_RETURN_NOTIMPLEMENTED;
11133
11134 if (PyUnicode_READY(left) == -1 ||
11135 PyUnicode_READY(right) == -1)
11136 return NULL;
11137
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011138 if (left == right) {
11139 switch (op) {
11140 case Py_EQ:
11141 case Py_LE:
11142 case Py_GE:
11143 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011144 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011145 case Py_NE:
11146 case Py_LT:
11147 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011148 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011149 default:
11150 PyErr_BadArgument();
11151 return NULL;
11152 }
11153 }
11154 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011155 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011156 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011157 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011158 }
11159 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011160 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011161 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011162 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011163}
11164
Alexander Belopolsky40018472011-02-26 01:02:56 +000011165int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011166_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11167{
11168 return unicode_eq(aa, bb);
11169}
11170
11171int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011172PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011173{
Victor Stinner77282cb2013-04-14 19:22:47 +020011174 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 void *buf1, *buf2;
11176 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011177 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011178
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011179 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011180 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011181 "'in <string>' requires string as left operand, not %.100s",
11182 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011183 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011184 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011185 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011186 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011187 if (ensure_unicode(str) < 0)
11188 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011191 kind2 = PyUnicode_KIND(substr);
11192 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011193 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011195 len2 = PyUnicode_GET_LENGTH(substr);
11196 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011197 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011198 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011199 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011200 if (len2 == 1) {
11201 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11202 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011203 return result;
11204 }
11205 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011206 buf2 = _PyUnicode_AsKind(substr, kind1);
11207 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011208 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210
Victor Stinner77282cb2013-04-14 19:22:47 +020011211 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 case PyUnicode_1BYTE_KIND:
11213 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11214 break;
11215 case PyUnicode_2BYTE_KIND:
11216 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11217 break;
11218 case PyUnicode_4BYTE_KIND:
11219 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11220 break;
11221 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011222 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011223 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011224
Victor Stinner77282cb2013-04-14 19:22:47 +020011225 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 PyMem_Free(buf2);
11227
Guido van Rossum403d68b2000-03-13 15:55:09 +000011228 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011229}
11230
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231/* Concat to string or Unicode object giving a new Unicode object. */
11232
Alexander Belopolsky40018472011-02-26 01:02:56 +000011233PyObject *
11234PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011237 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011238 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011240 if (ensure_unicode(left) < 0)
11241 return NULL;
11242
11243 if (!PyUnicode_Check(right)) {
11244 PyErr_Format(PyExc_TypeError,
11245 "can only concatenate str (not \"%.200s\") to str",
11246 right->ob_type->tp_name);
11247 return NULL;
11248 }
11249 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
11252 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011253 if (left == unicode_empty)
11254 return PyUnicode_FromObject(right);
11255 if (right == unicode_empty)
11256 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011258 left_len = PyUnicode_GET_LENGTH(left);
11259 right_len = PyUnicode_GET_LENGTH(right);
11260 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011261 PyErr_SetString(PyExc_OverflowError,
11262 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011263 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011264 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011265 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011266
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011267 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11268 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011269 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011272 result = PyUnicode_New(new_len, maxchar);
11273 if (result == NULL)
11274 return NULL;
11275 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11276 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11277 assert(_PyUnicode_CheckConsistency(result, 1));
11278 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279}
11280
Walter Dörwald1ab83302007-05-18 17:15:44 +000011281void
Victor Stinner23e56682011-10-03 03:54:37 +020011282PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011283{
Victor Stinner23e56682011-10-03 03:54:37 +020011284 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011285 Py_UCS4 maxchar, maxchar2;
11286 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011287
11288 if (p_left == NULL) {
11289 if (!PyErr_Occurred())
11290 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011291 return;
11292 }
Victor Stinner23e56682011-10-03 03:54:37 +020011293 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011294 if (right == NULL || left == NULL
11295 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011296 if (!PyErr_Occurred())
11297 PyErr_BadInternalCall();
11298 goto error;
11299 }
11300
Benjamin Petersonbac79492012-01-14 13:34:47 -050011301 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011302 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011303 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011304 goto error;
11305
Victor Stinner488fa492011-12-12 00:01:39 +010011306 /* Shortcuts */
11307 if (left == unicode_empty) {
11308 Py_DECREF(left);
11309 Py_INCREF(right);
11310 *p_left = right;
11311 return;
11312 }
11313 if (right == unicode_empty)
11314 return;
11315
11316 left_len = PyUnicode_GET_LENGTH(left);
11317 right_len = PyUnicode_GET_LENGTH(right);
11318 if (left_len > PY_SSIZE_T_MAX - right_len) {
11319 PyErr_SetString(PyExc_OverflowError,
11320 "strings are too large to concat");
11321 goto error;
11322 }
11323 new_len = left_len + right_len;
11324
11325 if (unicode_modifiable(left)
11326 && PyUnicode_CheckExact(right)
11327 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011328 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11329 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011330 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011331 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011332 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11333 {
11334 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011335 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011336 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011337
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011338 /* copy 'right' into the newly allocated area of 'left' */
11339 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011340 }
Victor Stinner488fa492011-12-12 00:01:39 +010011341 else {
11342 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11343 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011344 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011345
Victor Stinner488fa492011-12-12 00:01:39 +010011346 /* Concat the two Unicode strings */
11347 res = PyUnicode_New(new_len, maxchar);
11348 if (res == NULL)
11349 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011350 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11351 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011352 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011353 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011354 }
11355 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011356 return;
11357
11358error:
Victor Stinner488fa492011-12-12 00:01:39 +010011359 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011360}
11361
11362void
11363PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11364{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011365 PyUnicode_Append(pleft, right);
11366 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011367}
11368
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011369/*
11370Wraps stringlib_parse_args_finds() and additionally ensures that the
11371first argument is a unicode object.
11372*/
11373
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011374static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011375parse_args_finds_unicode(const char * function_name, PyObject *args,
11376 PyObject **substring,
11377 Py_ssize_t *start, Py_ssize_t *end)
11378{
11379 if(stringlib_parse_args_finds(function_name, args, substring,
11380 start, end)) {
11381 if (ensure_unicode(*substring) < 0)
11382 return 0;
11383 return 1;
11384 }
11385 return 0;
11386}
11387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011388PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011391Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011392string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011393interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394
11395static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011396unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011398 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011399 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011400 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011402 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 void *buf1, *buf2;
11404 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011406 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011407 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 kind1 = PyUnicode_KIND(self);
11410 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011411 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011412 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 len1 = PyUnicode_GET_LENGTH(self);
11415 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011417 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011418 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011419
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011420 buf1 = PyUnicode_DATA(self);
11421 buf2 = PyUnicode_DATA(substring);
11422 if (kind2 != kind1) {
11423 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011424 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011425 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011426 }
11427 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 case PyUnicode_1BYTE_KIND:
11429 iresult = ucs1lib_count(
11430 ((Py_UCS1*)buf1) + start, end - start,
11431 buf2, len2, PY_SSIZE_T_MAX
11432 );
11433 break;
11434 case PyUnicode_2BYTE_KIND:
11435 iresult = ucs2lib_count(
11436 ((Py_UCS2*)buf1) + start, end - start,
11437 buf2, len2, PY_SSIZE_T_MAX
11438 );
11439 break;
11440 case PyUnicode_4BYTE_KIND:
11441 iresult = ucs4lib_count(
11442 ((Py_UCS4*)buf1) + start, end - start,
11443 buf2, len2, PY_SSIZE_T_MAX
11444 );
11445 break;
11446 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011447 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 }
11449
11450 result = PyLong_FromSsize_t(iresult);
11451
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011452 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 return result;
11456}
11457
INADA Naoki3ae20562017-01-16 20:41:20 +090011458/*[clinic input]
11459str.encode as unicode_encode
11460
11461 encoding: str(c_default="NULL") = 'utf-8'
11462 The encoding in which to encode the string.
11463 errors: str(c_default="NULL") = 'strict'
11464 The error handling scheme to use for encoding errors.
11465 The default is 'strict' meaning that encoding errors raise a
11466 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11467 'xmlcharrefreplace' as well as any other name registered with
11468 codecs.register_error that can handle UnicodeEncodeErrors.
11469
11470Encode the string using the codec registered for encoding.
11471[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472
11473static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011474unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011475/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011477 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011478}
11479
INADA Naoki3ae20562017-01-16 20:41:20 +090011480/*[clinic input]
11481str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
INADA Naoki3ae20562017-01-16 20:41:20 +090011483 tabsize: int = 8
11484
11485Return a copy where all tab characters are expanded using spaces.
11486
11487If tabsize is not given, a tab size of 8 characters is assumed.
11488[clinic start generated code]*/
11489
11490static PyObject *
11491unicode_expandtabs_impl(PyObject *self, int tabsize)
11492/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011494 Py_ssize_t i, j, line_pos, src_len, incr;
11495 Py_UCS4 ch;
11496 PyObject *u;
11497 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011498 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011499 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
Antoine Pitrou22425222011-10-04 19:10:51 +020011501 if (PyUnicode_READY(self) == -1)
11502 return NULL;
11503
Thomas Wouters7e474022000-07-16 12:04:32 +000011504 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011505 src_len = PyUnicode_GET_LENGTH(self);
11506 i = j = line_pos = 0;
11507 kind = PyUnicode_KIND(self);
11508 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011509 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011510 for (; i < src_len; i++) {
11511 ch = PyUnicode_READ(kind, src_data, i);
11512 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011513 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011515 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011517 goto overflow;
11518 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011520 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011524 goto overflow;
11525 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011527 if (ch == '\n' || ch == '\r')
11528 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011530 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011531 if (!found)
11532 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011533
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011535 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536 if (!u)
11537 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011538 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
Antoine Pitroue71d5742011-10-04 15:55:09 +020011540 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
Antoine Pitroue71d5742011-10-04 15:55:09 +020011542 for (; i < src_len; i++) {
11543 ch = PyUnicode_READ(kind, src_data, i);
11544 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011546 incr = tabsize - (line_pos % tabsize);
11547 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011548 FILL(kind, dest_data, ' ', j, incr);
11549 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011551 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011553 line_pos++;
11554 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011555 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011556 if (ch == '\n' || ch == '\r')
11557 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011559 }
11560 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011561 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011562
Antoine Pitroue71d5742011-10-04 15:55:09 +020011563 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011564 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566}
11567
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011568PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570\n\
11571Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011572such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573arguments start and end are interpreted as in slice notation.\n\
11574\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011575Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576
11577static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011580 /* initialize variables to prevent gcc warning */
11581 PyObject *substring = NULL;
11582 Py_ssize_t start = 0;
11583 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011584 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011586 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011589 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011592 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 if (result == -2)
11595 return NULL;
11596
Christian Heimes217cfd12007-12-02 14:31:20 +000011597 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598}
11599
11600static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011601unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011603 void *data;
11604 enum PyUnicode_Kind kind;
11605 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011606
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011607 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011608 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011610 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011611 if (PyUnicode_READY(self) == -1) {
11612 return NULL;
11613 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011614 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11615 PyErr_SetString(PyExc_IndexError, "string index out of range");
11616 return NULL;
11617 }
11618 kind = PyUnicode_KIND(self);
11619 data = PyUnicode_DATA(self);
11620 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011621 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622}
11623
Guido van Rossumc2504932007-09-18 19:42:40 +000011624/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011625 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011626static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011627unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628{
Guido van Rossumc2504932007-09-18 19:42:40 +000011629 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011630 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011631
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011632#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011633 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011634#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 if (_PyUnicode_HASH(self) != -1)
11636 return _PyUnicode_HASH(self);
11637 if (PyUnicode_READY(self) == -1)
11638 return -1;
11639 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011640 /*
11641 We make the hash of the empty string be 0, rather than using
11642 (prefix ^ suffix), since this slightly obfuscates the hash secret
11643 */
11644 if (len == 0) {
11645 _PyUnicode_HASH(self) = 0;
11646 return 0;
11647 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011648 x = _Py_HashBytes(PyUnicode_DATA(self),
11649 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011651 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652}
11653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011654PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011657Return the lowest index in S where substring sub is found, \n\
11658such that sub is contained within S[start:end]. Optional\n\
11659arguments start and end are interpreted as in slice notation.\n\
11660\n\
11661Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662
11663static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011666 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011667 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011668 PyObject *substring = NULL;
11669 Py_ssize_t start = 0;
11670 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011672 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011675 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011678 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 if (result == -2)
11681 return NULL;
11682
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683 if (result < 0) {
11684 PyErr_SetString(PyExc_ValueError, "substring not found");
11685 return NULL;
11686 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011687
Christian Heimes217cfd12007-12-02 14:31:20 +000011688 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689}
11690
INADA Naoki3ae20562017-01-16 20:41:20 +090011691/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011692str.isascii as unicode_isascii
11693
11694Return True if all characters in the string are ASCII, False otherwise.
11695
11696ASCII characters have code points in the range U+0000-U+007F.
11697Empty string is ASCII too.
11698[clinic start generated code]*/
11699
11700static PyObject *
11701unicode_isascii_impl(PyObject *self)
11702/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11703{
11704 if (PyUnicode_READY(self) == -1) {
11705 return NULL;
11706 }
11707 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11708}
11709
11710/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011711str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
INADA Naoki3ae20562017-01-16 20:41:20 +090011713Return True if the string is a lowercase string, False otherwise.
11714
11715A string is lowercase if all cased characters in the string are lowercase and
11716there is at least one cased character in the string.
11717[clinic start generated code]*/
11718
11719static PyObject *
11720unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011721/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 Py_ssize_t i, length;
11724 int kind;
11725 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 int cased;
11727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (PyUnicode_READY(self) == -1)
11729 return NULL;
11730 length = PyUnicode_GET_LENGTH(self);
11731 kind = PyUnicode_KIND(self);
11732 data = PyUnicode_DATA(self);
11733
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 if (length == 1)
11736 return PyBool_FromLong(
11737 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011739 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011741 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011742
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 for (i = 0; i < length; i++) {
11745 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011746
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011748 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 else if (!cased && Py_UNICODE_ISLOWER(ch))
11750 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011752 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753}
11754
INADA Naoki3ae20562017-01-16 20:41:20 +090011755/*[clinic input]
11756str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757
INADA Naoki3ae20562017-01-16 20:41:20 +090011758Return True if the string is an uppercase string, False otherwise.
11759
11760A string is uppercase if all cased characters in the string are uppercase and
11761there is at least one cased character in the string.
11762[clinic start generated code]*/
11763
11764static PyObject *
11765unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011766/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 Py_ssize_t i, length;
11769 int kind;
11770 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 int cased;
11772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 if (PyUnicode_READY(self) == -1)
11774 return NULL;
11775 length = PyUnicode_GET_LENGTH(self);
11776 kind = PyUnicode_KIND(self);
11777 data = PyUnicode_DATA(self);
11778
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 if (length == 1)
11781 return PyBool_FromLong(
11782 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011784 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011786 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011787
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 for (i = 0; i < length; i++) {
11790 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011791
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011793 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 else if (!cased && Py_UNICODE_ISUPPER(ch))
11795 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011797 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798}
11799
INADA Naoki3ae20562017-01-16 20:41:20 +090011800/*[clinic input]
11801str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802
INADA Naoki3ae20562017-01-16 20:41:20 +090011803Return True if the string is a title-cased string, False otherwise.
11804
11805In a title-cased string, upper- and title-case characters may only
11806follow uncased characters and lowercase characters only cased ones.
11807[clinic start generated code]*/
11808
11809static PyObject *
11810unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011811/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 Py_ssize_t i, length;
11814 int kind;
11815 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 int cased, previous_is_cased;
11817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 if (PyUnicode_READY(self) == -1)
11819 return NULL;
11820 length = PyUnicode_GET_LENGTH(self);
11821 kind = PyUnicode_KIND(self);
11822 data = PyUnicode_DATA(self);
11823
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 if (length == 1) {
11826 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11827 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11828 (Py_UNICODE_ISUPPER(ch) != 0));
11829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011831 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011833 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011834
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 cased = 0;
11836 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 for (i = 0; i < length; i++) {
11838 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011839
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11841 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011842 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 previous_is_cased = 1;
11844 cased = 1;
11845 }
11846 else if (Py_UNICODE_ISLOWER(ch)) {
11847 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011848 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 previous_is_cased = 1;
11850 cased = 1;
11851 }
11852 else
11853 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011855 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856}
11857
INADA Naoki3ae20562017-01-16 20:41:20 +090011858/*[clinic input]
11859str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860
INADA Naoki3ae20562017-01-16 20:41:20 +090011861Return True if the string is a whitespace string, False otherwise.
11862
11863A string is whitespace if all characters in the string are whitespace and there
11864is at least one character in the string.
11865[clinic start generated code]*/
11866
11867static PyObject *
11868unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011869/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 Py_ssize_t i, length;
11872 int kind;
11873 void *data;
11874
11875 if (PyUnicode_READY(self) == -1)
11876 return NULL;
11877 length = PyUnicode_GET_LENGTH(self);
11878 kind = PyUnicode_KIND(self);
11879 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 if (length == 1)
11883 return PyBool_FromLong(
11884 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011886 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011888 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 for (i = 0; i < length; i++) {
11891 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011892 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011893 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011895 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896}
11897
INADA Naoki3ae20562017-01-16 20:41:20 +090011898/*[clinic input]
11899str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011900
INADA Naoki3ae20562017-01-16 20:41:20 +090011901Return True if the string is an alphabetic string, False otherwise.
11902
11903A string is alphabetic if all characters in the string are alphabetic and there
11904is at least one character in the string.
11905[clinic start generated code]*/
11906
11907static PyObject *
11908unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011909/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 Py_ssize_t i, length;
11912 int kind;
11913 void *data;
11914
11915 if (PyUnicode_READY(self) == -1)
11916 return NULL;
11917 length = PyUnicode_GET_LENGTH(self);
11918 kind = PyUnicode_KIND(self);
11919 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011920
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011921 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 if (length == 1)
11923 return PyBool_FromLong(
11924 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011925
11926 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011928 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 for (i = 0; i < length; i++) {
11931 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011932 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011933 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011934 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011935}
11936
INADA Naoki3ae20562017-01-16 20:41:20 +090011937/*[clinic input]
11938str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011939
INADA Naoki3ae20562017-01-16 20:41:20 +090011940Return True if the string is an alpha-numeric string, False otherwise.
11941
11942A string is alpha-numeric if all characters in the string are alpha-numeric and
11943there is at least one character in the string.
11944[clinic start generated code]*/
11945
11946static PyObject *
11947unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011948/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 int kind;
11951 void *data;
11952 Py_ssize_t len, i;
11953
11954 if (PyUnicode_READY(self) == -1)
11955 return NULL;
11956
11957 kind = PyUnicode_KIND(self);
11958 data = PyUnicode_DATA(self);
11959 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011960
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011961 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 if (len == 1) {
11963 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11964 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11965 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011966
11967 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011969 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 for (i = 0; i < len; i++) {
11972 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011973 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011974 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011975 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011976 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011977}
11978
INADA Naoki3ae20562017-01-16 20:41:20 +090011979/*[clinic input]
11980str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
INADA Naoki3ae20562017-01-16 20:41:20 +090011982Return True if the string is a decimal string, False otherwise.
11983
11984A string is a decimal string if all characters in the string are decimal and
11985there is at least one character in the string.
11986[clinic start generated code]*/
11987
11988static PyObject *
11989unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011990/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 Py_ssize_t i, length;
11993 int kind;
11994 void *data;
11995
11996 if (PyUnicode_READY(self) == -1)
11997 return NULL;
11998 length = PyUnicode_GET_LENGTH(self);
11999 kind = PyUnicode_KIND(self);
12000 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (length == 1)
12004 return PyBool_FromLong(
12005 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012007 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012009 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 for (i = 0; i < length; i++) {
12012 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012013 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012015 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016}
12017
INADA Naoki3ae20562017-01-16 20:41:20 +090012018/*[clinic input]
12019str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020
INADA Naoki3ae20562017-01-16 20:41:20 +090012021Return True if the string is a digit string, False otherwise.
12022
12023A string is a digit string if all characters in the string are digits and there
12024is at least one character in the string.
12025[clinic start generated code]*/
12026
12027static PyObject *
12028unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012029/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 Py_ssize_t i, length;
12032 int kind;
12033 void *data;
12034
12035 if (PyUnicode_READY(self) == -1)
12036 return NULL;
12037 length = PyUnicode_GET_LENGTH(self);
12038 kind = PyUnicode_KIND(self);
12039 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 if (length == 1) {
12043 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12044 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012047 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012049 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 for (i = 0; i < length; i++) {
12052 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012053 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012055 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056}
12057
INADA Naoki3ae20562017-01-16 20:41:20 +090012058/*[clinic input]
12059str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060
INADA Naoki3ae20562017-01-16 20:41:20 +090012061Return True if the string is a numeric string, False otherwise.
12062
12063A string is numeric if all characters in the string are numeric and there is at
12064least one character in the string.
12065[clinic start generated code]*/
12066
12067static PyObject *
12068unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012069/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 Py_ssize_t i, length;
12072 int kind;
12073 void *data;
12074
12075 if (PyUnicode_READY(self) == -1)
12076 return NULL;
12077 length = PyUnicode_GET_LENGTH(self);
12078 kind = PyUnicode_KIND(self);
12079 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 if (length == 1)
12083 return PyBool_FromLong(
12084 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012086 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012088 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 for (i = 0; i < length; i++) {
12091 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012092 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012094 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095}
12096
Martin v. Löwis47383402007-08-15 07:32:56 +000012097int
12098PyUnicode_IsIdentifier(PyObject *self)
12099{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 int kind;
12101 void *data;
12102 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012103 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (PyUnicode_READY(self) == -1) {
12106 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012107 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 }
12109
12110 /* Special case for empty strings */
12111 if (PyUnicode_GET_LENGTH(self) == 0)
12112 return 0;
12113 kind = PyUnicode_KIND(self);
12114 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012115
12116 /* PEP 3131 says that the first character must be in
12117 XID_Start and subsequent characters in XID_Continue,
12118 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012119 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012120 letters, digits, underscore). However, given the current
12121 definition of XID_Start and XID_Continue, it is sufficient
12122 to check just for these, except that _ must be allowed
12123 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012125 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012126 return 0;
12127
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012128 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012131 return 1;
12132}
12133
INADA Naoki3ae20562017-01-16 20:41:20 +090012134/*[clinic input]
12135str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012136
INADA Naoki3ae20562017-01-16 20:41:20 +090012137Return True if the string is a valid Python identifier, False otherwise.
12138
12139Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12140"class".
12141[clinic start generated code]*/
12142
12143static PyObject *
12144unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012145/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012146{
12147 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12148}
12149
INADA Naoki3ae20562017-01-16 20:41:20 +090012150/*[clinic input]
12151str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012152
INADA Naoki3ae20562017-01-16 20:41:20 +090012153Return True if the string is printable, False otherwise.
12154
12155A string is printable if all of its characters are considered printable in
12156repr() or if it is empty.
12157[clinic start generated code]*/
12158
12159static PyObject *
12160unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012161/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012162{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 Py_ssize_t i, length;
12164 int kind;
12165 void *data;
12166
12167 if (PyUnicode_READY(self) == -1)
12168 return NULL;
12169 length = PyUnicode_GET_LENGTH(self);
12170 kind = PyUnicode_KIND(self);
12171 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012172
12173 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 if (length == 1)
12175 return PyBool_FromLong(
12176 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 for (i = 0; i < length; i++) {
12179 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012180 Py_RETURN_FALSE;
12181 }
12182 }
12183 Py_RETURN_TRUE;
12184}
12185
INADA Naoki3ae20562017-01-16 20:41:20 +090012186/*[clinic input]
12187str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
INADA Naoki3ae20562017-01-16 20:41:20 +090012189 iterable: object
12190 /
12191
12192Concatenate any number of strings.
12193
Martin Panter91a88662017-01-24 00:30:06 +000012194The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012195The result is returned as a new string.
12196
12197Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12198[clinic start generated code]*/
12199
12200static PyObject *
12201unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012202/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203{
INADA Naoki3ae20562017-01-16 20:41:20 +090012204 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205}
12206
Martin v. Löwis18e16552006-02-15 17:27:45 +000012207static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012208unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 if (PyUnicode_READY(self) == -1)
12211 return -1;
12212 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213}
12214
INADA Naoki3ae20562017-01-16 20:41:20 +090012215/*[clinic input]
12216str.ljust as unicode_ljust
12217
12218 width: Py_ssize_t
12219 fillchar: Py_UCS4 = ' '
12220 /
12221
12222Return a left-justified string of length width.
12223
12224Padding is done using the specified fill character (default is a space).
12225[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226
12227static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012228unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12229/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012231 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
Victor Stinnerc4b49542011-12-11 22:44:26 +010012234 if (PyUnicode_GET_LENGTH(self) >= width)
12235 return unicode_result_unchanged(self);
12236
12237 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238}
12239
INADA Naoki3ae20562017-01-16 20:41:20 +090012240/*[clinic input]
12241str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242
INADA Naoki3ae20562017-01-16 20:41:20 +090012243Return a copy of the string converted to lowercase.
12244[clinic start generated code]*/
12245
12246static PyObject *
12247unicode_lower_impl(PyObject *self)
12248/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012250 if (PyUnicode_READY(self) == -1)
12251 return NULL;
12252 if (PyUnicode_IS_ASCII(self))
12253 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012254 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255}
12256
/* Strip modes.  The numeric values double as indices into stripfuncnames
   below, so the two must be kept in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, one per strip mode, in the order of the values above. */
static const char *stripfuncnames[] = {
    "lstrip",       /* LEFTSTRIP */
    "rstrip",       /* RIGHTSTRIP */
    "strip"         /* BOTHSTRIP */
};

/* Method name corresponding to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012265
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012266/* externally visible for str.strip(unicode) */
12267PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012268_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 void *data;
12271 int kind;
12272 Py_ssize_t i, j, len;
12273 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012274 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12277 return NULL;
12278
12279 kind = PyUnicode_KIND(self);
12280 data = PyUnicode_DATA(self);
12281 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012282 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12284 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012285 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012286
Benjamin Peterson14339b62009-01-31 16:36:08 +000012287 i = 0;
12288 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012289 while (i < len) {
12290 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12291 if (!BLOOM(sepmask, ch))
12292 break;
12293 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12294 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 i++;
12296 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012297 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012298
Benjamin Peterson14339b62009-01-31 16:36:08 +000012299 j = len;
12300 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012301 j--;
12302 while (j >= i) {
12303 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12304 if (!BLOOM(sepmask, ch))
12305 break;
12306 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12307 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012309 }
12310
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012312 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012313
Victor Stinner7931d9a2011-11-04 00:22:48 +010012314 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315}
12316
12317PyObject*
12318PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12319{
12320 unsigned char *data;
12321 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012322 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323
Victor Stinnerde636f32011-10-01 03:55:54 +020012324 if (PyUnicode_READY(self) == -1)
12325 return NULL;
12326
Victor Stinner684d5fd2012-05-03 02:32:34 +020012327 length = PyUnicode_GET_LENGTH(self);
12328 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012329
Victor Stinner684d5fd2012-05-03 02:32:34 +020012330 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012331 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332
Victor Stinnerde636f32011-10-01 03:55:54 +020012333 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012334 PyErr_SetString(PyExc_IndexError, "string index out of range");
12335 return NULL;
12336 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012337 if (start >= length || end < start)
12338 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012339
Victor Stinner684d5fd2012-05-03 02:32:34 +020012340 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012341 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012342 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012343 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012344 }
12345 else {
12346 kind = PyUnicode_KIND(self);
12347 data = PyUnicode_1BYTE_DATA(self);
12348 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012349 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012350 length);
12351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353
12354static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012355do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 Py_ssize_t len, i, j;
12358
12359 if (PyUnicode_READY(self) == -1)
12360 return NULL;
12361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012363
Victor Stinnercc7af722013-04-09 22:39:24 +020012364 if (PyUnicode_IS_ASCII(self)) {
12365 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12366
12367 i = 0;
12368 if (striptype != RIGHTSTRIP) {
12369 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012370 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012371 if (!_Py_ascii_whitespace[ch])
12372 break;
12373 i++;
12374 }
12375 }
12376
12377 j = len;
12378 if (striptype != LEFTSTRIP) {
12379 j--;
12380 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012381 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012382 if (!_Py_ascii_whitespace[ch])
12383 break;
12384 j--;
12385 }
12386 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012387 }
12388 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012389 else {
12390 int kind = PyUnicode_KIND(self);
12391 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012392
Victor Stinnercc7af722013-04-09 22:39:24 +020012393 i = 0;
12394 if (striptype != RIGHTSTRIP) {
12395 while (i < len) {
12396 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12397 if (!Py_UNICODE_ISSPACE(ch))
12398 break;
12399 i++;
12400 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012401 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012402
12403 j = len;
12404 if (striptype != LEFTSTRIP) {
12405 j--;
12406 while (j >= i) {
12407 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12408 if (!Py_UNICODE_ISSPACE(ch))
12409 break;
12410 j--;
12411 }
12412 j++;
12413 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012414 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012415
Victor Stinner7931d9a2011-11-04 00:22:48 +010012416 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417}
12418
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012419
12420static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012421do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012422{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012423 if (sep != NULL && sep != Py_None) {
12424 if (PyUnicode_Check(sep))
12425 return _PyUnicode_XStrip(self, striptype, sep);
12426 else {
12427 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012428 "%s arg must be None or str",
12429 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012430 return NULL;
12431 }
12432 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012433
Benjamin Peterson14339b62009-01-31 16:36:08 +000012434 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012435}
12436
12437
INADA Naoki3ae20562017-01-16 20:41:20 +090012438/*[clinic input]
12439str.strip as unicode_strip
12440
12441 chars: object = None
12442 /
12443
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012445
12446If chars is given and not None, remove characters in chars instead.
12447[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012448
12449static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012450unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012451/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012452{
INADA Naoki3ae20562017-01-16 20:41:20 +090012453 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012454}
12455
12456
INADA Naoki3ae20562017-01-16 20:41:20 +090012457/*[clinic input]
12458str.lstrip as unicode_lstrip
12459
12460 chars: object = NULL
12461 /
12462
12463Return a copy of the string with leading whitespace removed.
12464
12465If chars is given and not None, remove characters in chars instead.
12466[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012467
12468static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012469unicode_lstrip_impl(PyObject *self, PyObject *chars)
12470/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012471{
INADA Naoki3ae20562017-01-16 20:41:20 +090012472 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012473}
12474
12475
INADA Naoki3ae20562017-01-16 20:41:20 +090012476/*[clinic input]
12477str.rstrip as unicode_rstrip
12478
12479 chars: object = NULL
12480 /
12481
12482Return a copy of the string with trailing whitespace removed.
12483
12484If chars is given and not None, remove characters in chars instead.
12485[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012486
12487static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012488unicode_rstrip_impl(PyObject *self, PyObject *chars)
12489/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012490{
INADA Naoki3ae20562017-01-16 20:41:20 +090012491 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012492}
12493
12494
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012496unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012498 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
Serhiy Storchaka05997252013-01-26 12:14:02 +020012501 if (len < 1)
12502 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503
Victor Stinnerc4b49542011-12-11 22:44:26 +010012504 /* no repeat, return original string */
12505 if (len == 1)
12506 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012507
Benjamin Petersonbac79492012-01-14 13:34:47 -050012508 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 return NULL;
12510
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012511 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012512 PyErr_SetString(PyExc_OverflowError,
12513 "repeated string is too long");
12514 return NULL;
12515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012517
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012518 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519 if (!u)
12520 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012521 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 if (PyUnicode_GET_LENGTH(str) == 1) {
12524 const int kind = PyUnicode_KIND(str);
12525 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012526 if (kind == PyUnicode_1BYTE_KIND) {
12527 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012528 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012529 }
12530 else if (kind == PyUnicode_2BYTE_KIND) {
12531 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012532 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012533 ucs2[n] = fill_char;
12534 } else {
12535 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12536 assert(kind == PyUnicode_4BYTE_KIND);
12537 for (n = 0; n < len; ++n)
12538 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 }
12541 else {
12542 /* number of characters copied this far */
12543 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012544 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012546 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012550 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012551 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553 }
12554
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012555 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012556 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557}
12558
Alexander Belopolsky40018472011-02-26 01:02:56 +000012559PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012560PyUnicode_Replace(PyObject *str,
12561 PyObject *substr,
12562 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012563 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012565 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12566 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012567 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012568 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569}
12570
INADA Naoki3ae20562017-01-16 20:41:20 +090012571/*[clinic input]
12572str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573
INADA Naoki3ae20562017-01-16 20:41:20 +090012574 old: unicode
12575 new: unicode
12576 count: Py_ssize_t = -1
12577 Maximum number of occurrences to replace.
12578 -1 (the default value) means replace all occurrences.
12579 /
12580
12581Return a copy with all occurrences of substring old replaced by new.
12582
12583If the optional argument count is given, only the first count occurrences are
12584replaced.
12585[clinic start generated code]*/
12586
12587static PyObject *
12588unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12589 Py_ssize_t count)
12590/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012592 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012594 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595}
12596
Alexander Belopolsky40018472011-02-26 01:02:56 +000012597static PyObject *
12598unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012600 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 Py_ssize_t isize;
12602 Py_ssize_t osize, squote, dquote, i, o;
12603 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012604 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012608 return NULL;
12609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 isize = PyUnicode_GET_LENGTH(unicode);
12611 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 /* Compute length of output, quote characters, and
12614 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012615 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 max = 127;
12617 squote = dquote = 0;
12618 ikind = PyUnicode_KIND(unicode);
12619 for (i = 0; i < isize; i++) {
12620 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012621 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012623 case '\'': squote++; break;
12624 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012626 incr = 2;
12627 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 default:
12629 /* Fast-path ASCII */
12630 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012631 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012633 ;
12634 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012637 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012639 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012641 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012643 if (osize > PY_SSIZE_T_MAX - incr) {
12644 PyErr_SetString(PyExc_OverflowError,
12645 "string is too long to generate repr");
12646 return NULL;
12647 }
12648 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 }
12650
12651 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012652 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012654 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 if (dquote)
12656 /* Both squote and dquote present. Use squote,
12657 and escape them */
12658 osize += squote;
12659 else
12660 quote = '"';
12661 }
Victor Stinner55c08782013-04-14 18:45:39 +020012662 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663
12664 repr = PyUnicode_New(osize, max);
12665 if (repr == NULL)
12666 return NULL;
12667 okind = PyUnicode_KIND(repr);
12668 odata = PyUnicode_DATA(repr);
12669
12670 PyUnicode_WRITE(okind, odata, 0, quote);
12671 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012672 if (unchanged) {
12673 _PyUnicode_FastCopyCharacters(repr, 1,
12674 unicode, 0,
12675 isize);
12676 }
12677 else {
12678 for (i = 0, o = 1; i < isize; i++) {
12679 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680
Victor Stinner55c08782013-04-14 18:45:39 +020012681 /* Escape quotes and backslashes */
12682 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012683 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012685 continue;
12686 }
12687
12688 /* Map special whitespace to '\t', \n', '\r' */
12689 if (ch == '\t') {
12690 PyUnicode_WRITE(okind, odata, o++, '\\');
12691 PyUnicode_WRITE(okind, odata, o++, 't');
12692 }
12693 else if (ch == '\n') {
12694 PyUnicode_WRITE(okind, odata, o++, '\\');
12695 PyUnicode_WRITE(okind, odata, o++, 'n');
12696 }
12697 else if (ch == '\r') {
12698 PyUnicode_WRITE(okind, odata, o++, '\\');
12699 PyUnicode_WRITE(okind, odata, o++, 'r');
12700 }
12701
12702 /* Map non-printable US ASCII to '\xhh' */
12703 else if (ch < ' ' || ch == 0x7F) {
12704 PyUnicode_WRITE(okind, odata, o++, '\\');
12705 PyUnicode_WRITE(okind, odata, o++, 'x');
12706 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12707 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12708 }
12709
12710 /* Copy ASCII characters as-is */
12711 else if (ch < 0x7F) {
12712 PyUnicode_WRITE(okind, odata, o++, ch);
12713 }
12714
12715 /* Non-ASCII characters */
12716 else {
12717 /* Map Unicode whitespace and control characters
12718 (categories Z* and C* except ASCII space)
12719 */
12720 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12721 PyUnicode_WRITE(okind, odata, o++, '\\');
12722 /* Map 8-bit characters to '\xhh' */
12723 if (ch <= 0xff) {
12724 PyUnicode_WRITE(okind, odata, o++, 'x');
12725 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12726 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12727 }
12728 /* Map 16-bit characters to '\uxxxx' */
12729 else if (ch <= 0xffff) {
12730 PyUnicode_WRITE(okind, odata, o++, 'u');
12731 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12732 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12733 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12734 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12735 }
12736 /* Map 21-bit characters to '\U00xxxxxx' */
12737 else {
12738 PyUnicode_WRITE(okind, odata, o++, 'U');
12739 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12740 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12741 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12743 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12744 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12745 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12746 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12747 }
12748 }
12749 /* Copy characters as-is */
12750 else {
12751 PyUnicode_WRITE(okind, odata, o++, ch);
12752 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012753 }
12754 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012757 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012758 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759}
12760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012761PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012762 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763\n\
12764Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012765such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766arguments start and end are interpreted as in slice notation.\n\
12767\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012768Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769
12770static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012773 /* initialize variables to prevent gcc warning */
12774 PyObject *substring = NULL;
12775 Py_ssize_t start = 0;
12776 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012777 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012779 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012782 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012785 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012787 if (result == -2)
12788 return NULL;
12789
Christian Heimes217cfd12007-12-02 14:31:20 +000012790 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791}
12792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012793PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012796Return the highest index in S where substring sub is found,\n\
12797such that sub is contained within S[start:end]. Optional\n\
12798arguments start and end are interpreted as in slice notation.\n\
12799\n\
12800Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801
12802static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012805 /* initialize variables to prevent gcc warning */
12806 PyObject *substring = NULL;
12807 Py_ssize_t start = 0;
12808 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012811 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012814 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012817 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 if (result == -2)
12820 return NULL;
12821
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822 if (result < 0) {
12823 PyErr_SetString(PyExc_ValueError, "substring not found");
12824 return NULL;
12825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826
Christian Heimes217cfd12007-12-02 14:31:20 +000012827 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828}
12829
INADA Naoki3ae20562017-01-16 20:41:20 +090012830/*[clinic input]
12831str.rjust as unicode_rjust
12832
12833 width: Py_ssize_t
12834 fillchar: Py_UCS4 = ' '
12835 /
12836
12837Return a right-justified string of length width.
12838
12839Padding is done using the specified fill character (default is a space).
12840[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841
12842static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012843unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12844/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012846 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847 return NULL;
12848
Victor Stinnerc4b49542011-12-11 22:44:26 +010012849 if (PyUnicode_GET_LENGTH(self) >= width)
12850 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851
Victor Stinnerc4b49542011-12-11 22:44:26 +010012852 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853}
12854
Alexander Belopolsky40018472011-02-26 01:02:56 +000012855PyObject *
12856PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012858 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012861 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862}
12863
INADA Naoki3ae20562017-01-16 20:41:20 +090012864/*[clinic input]
12865str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866
INADA Naoki3ae20562017-01-16 20:41:20 +090012867 sep: object = None
        The delimiter according to which to split the string.
12869 None (the default value) means split according to any whitespace,
12870 and discard empty strings from the result.
12871 maxsplit: Py_ssize_t = -1
12872 Maximum number of splits to do.
12873 -1 (the default value) means no limit.
12874
12875Return a list of the words in the string, using sep as the delimiter string.
12876[clinic start generated code]*/
12877
12878static PyObject *
12879unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12880/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881{
INADA Naoki3ae20562017-01-16 20:41:20 +090012882 if (sep == Py_None)
12883 return split(self, NULL, maxsplit);
12884 if (PyUnicode_Check(sep))
12885 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012886
12887 PyErr_Format(PyExc_TypeError,
12888 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012889 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891}
12892
Thomas Wouters477c8d52006-05-27 19:21:47 +000012893PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012894PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012895{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012896 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012897 int kind1, kind2;
12898 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012900
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012901 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012902 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012903
Victor Stinner14f8f022011-10-05 20:58:25 +020012904 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 len1 = PyUnicode_GET_LENGTH(str_obj);
12907 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012908 if (kind1 < kind2 || len1 < len2) {
12909 _Py_INCREF_UNICODE_EMPTY();
12910 if (!unicode_empty)
12911 out = NULL;
12912 else {
12913 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12914 Py_DECREF(unicode_empty);
12915 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012916 return out;
12917 }
12918 buf1 = PyUnicode_DATA(str_obj);
12919 buf2 = PyUnicode_DATA(sep_obj);
12920 if (kind2 != kind1) {
12921 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12922 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012923 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012926 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012928 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12929 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12930 else
12931 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 break;
12933 case PyUnicode_2BYTE_KIND:
12934 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12935 break;
12936 case PyUnicode_4BYTE_KIND:
12937 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12938 break;
12939 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012940 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012942
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012943 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012945
12946 return out;
12947}
12948
12949
12950PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012951PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012952{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012953 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012954 int kind1, kind2;
12955 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012957
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012958 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012959 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012960
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012961 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012963 len1 = PyUnicode_GET_LENGTH(str_obj);
12964 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012965 if (kind1 < kind2 || len1 < len2) {
12966 _Py_INCREF_UNICODE_EMPTY();
12967 if (!unicode_empty)
12968 out = NULL;
12969 else {
12970 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12971 Py_DECREF(unicode_empty);
12972 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012973 return out;
12974 }
12975 buf1 = PyUnicode_DATA(str_obj);
12976 buf2 = PyUnicode_DATA(sep_obj);
12977 if (kind2 != kind1) {
12978 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12979 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012980 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012981 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012983 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012985 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12986 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12987 else
12988 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 break;
12990 case PyUnicode_2BYTE_KIND:
12991 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12992 break;
12993 case PyUnicode_4BYTE_KIND:
12994 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12995 break;
12996 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012997 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012999
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013000 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013002
13003 return out;
13004}
13005
INADA Naoki3ae20562017-01-16 20:41:20 +090013006/*[clinic input]
13007str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013008
INADA Naoki3ae20562017-01-16 20:41:20 +090013009 sep: object
13010 /
13011
13012Partition the string into three parts using the given separator.
13013
13014This will search for the separator in the string. If the separator is found,
13015returns a 3-tuple containing the part before the separator, the separator
13016itself, and the part after it.
13017
13018If the separator is not found, returns a 3-tuple containing the original string
13019and two empty strings.
13020[clinic start generated code]*/
13021
13022static PyObject *
13023unicode_partition(PyObject *self, PyObject *sep)
13024/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013025{
INADA Naoki3ae20562017-01-16 20:41:20 +090013026 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013027}
13028
INADA Naoki3ae20562017-01-16 20:41:20 +090013029/*[clinic input]
13030str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013031
INADA Naoki3ae20562017-01-16 20:41:20 +090013032Partition the string into three parts using the given separator.
13033
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013034This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013035the separator is found, returns a 3-tuple containing the part before the
13036separator, the separator itself, and the part after it.
13037
13038If the separator is not found, returns a 3-tuple containing two empty strings
13039and the original string.
13040[clinic start generated code]*/
13041
13042static PyObject *
13043unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013044/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013045{
INADA Naoki3ae20562017-01-16 20:41:20 +090013046 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013047}
13048
Alexander Belopolsky40018472011-02-26 01:02:56 +000013049PyObject *
13050PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013051{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013052 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013053 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013054
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013055 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013056}
13057
INADA Naoki3ae20562017-01-16 20:41:20 +090013058/*[clinic input]
13059str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013060
INADA Naoki3ae20562017-01-16 20:41:20 +090013061Return a list of the words in the string, using sep as the delimiter string.
13062
13063Splits are done starting at the end of the string and working to the front.
13064[clinic start generated code]*/
13065
13066static PyObject *
13067unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13068/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013069{
INADA Naoki3ae20562017-01-16 20:41:20 +090013070 if (sep == Py_None)
13071 return rsplit(self, NULL, maxsplit);
13072 if (PyUnicode_Check(sep))
13073 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013074
13075 PyErr_Format(PyExc_TypeError,
13076 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013077 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013078 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013079}
13080
INADA Naoki3ae20562017-01-16 20:41:20 +090013081/*[clinic input]
13082str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013084 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013085
13086Return a list of the lines in the string, breaking at line boundaries.
13087
13088Line breaks are not included in the resulting list unless keepends is given and
13089true.
13090[clinic start generated code]*/
13091
13092static PyObject *
13093unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013094/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013096 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097}
13098
13099static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013100PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013102 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103}
13104
INADA Naoki3ae20562017-01-16 20:41:20 +090013105/*[clinic input]
13106str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107
INADA Naoki3ae20562017-01-16 20:41:20 +090013108Convert uppercase characters to lowercase and lowercase characters to uppercase.
13109[clinic start generated code]*/
13110
13111static PyObject *
13112unicode_swapcase_impl(PyObject *self)
13113/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013115 if (PyUnicode_READY(self) == -1)
13116 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013117 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118}
13119
Larry Hastings61272b72014-01-07 12:41:53 -080013120/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013121
Larry Hastings31826802013-10-19 00:09:25 -070013122@staticmethod
13123str.maketrans as unicode_maketrans
13124
13125 x: object
13126
13127 y: unicode=NULL
13128
13129 z: unicode=NULL
13130
13131 /
13132
13133Return a translation table usable for str.translate().
13134
13135If there is only one argument, it must be a dictionary mapping Unicode
13136ordinals (integers) or characters to Unicode ordinals, strings or None.
13137Character keys will be then converted to ordinals.
13138If there are two arguments, they must be strings of equal length, and
13139in the resulting dictionary, each character in x will be mapped to the
13140character at the same position in y. If there is a third argument, it
13141must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013142[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013143
Larry Hastings31826802013-10-19 00:09:25 -070013144static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013145unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013146/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013147{
Georg Brandlceee0772007-11-27 23:48:05 +000013148 PyObject *new = NULL, *key, *value;
13149 Py_ssize_t i = 0;
13150 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151
Georg Brandlceee0772007-11-27 23:48:05 +000013152 new = PyDict_New();
13153 if (!new)
13154 return NULL;
13155 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 int x_kind, y_kind, z_kind;
13157 void *x_data, *y_data, *z_data;
13158
Georg Brandlceee0772007-11-27 23:48:05 +000013159 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013160 if (!PyUnicode_Check(x)) {
13161 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13162 "be a string if there is a second argument");
13163 goto err;
13164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013166 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13167 "arguments must have equal length");
13168 goto err;
13169 }
13170 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 x_kind = PyUnicode_KIND(x);
13172 y_kind = PyUnicode_KIND(y);
13173 x_data = PyUnicode_DATA(x);
13174 y_data = PyUnicode_DATA(y);
13175 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13176 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013177 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013178 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013179 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013180 if (!value) {
13181 Py_DECREF(key);
13182 goto err;
13183 }
Georg Brandlceee0772007-11-27 23:48:05 +000013184 res = PyDict_SetItem(new, key, value);
13185 Py_DECREF(key);
13186 Py_DECREF(value);
13187 if (res < 0)
13188 goto err;
13189 }
13190 /* create entries for deleting chars in z */
13191 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 z_kind = PyUnicode_KIND(z);
13193 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013194 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013196 if (!key)
13197 goto err;
13198 res = PyDict_SetItem(new, key, Py_None);
13199 Py_DECREF(key);
13200 if (res < 0)
13201 goto err;
13202 }
13203 }
13204 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 int kind;
13206 void *data;
13207
Georg Brandlceee0772007-11-27 23:48:05 +000013208 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013209 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013210 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13211 "to maketrans it must be a dict");
13212 goto err;
13213 }
13214 /* copy entries into the new dict, converting string keys to int keys */
13215 while (PyDict_Next(x, &i, &key, &value)) {
13216 if (PyUnicode_Check(key)) {
13217 /* convert string keys to integer keys */
13218 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013219 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013220 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13221 "table must be of length 1");
13222 goto err;
13223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013224 kind = PyUnicode_KIND(key);
13225 data = PyUnicode_DATA(key);
13226 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013227 if (!newkey)
13228 goto err;
13229 res = PyDict_SetItem(new, newkey, value);
13230 Py_DECREF(newkey);
13231 if (res < 0)
13232 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013233 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013234 /* just keep integer keys */
13235 if (PyDict_SetItem(new, key, value) < 0)
13236 goto err;
13237 } else {
13238 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13239 "be strings or integers");
13240 goto err;
13241 }
13242 }
13243 }
13244 return new;
13245 err:
13246 Py_DECREF(new);
13247 return NULL;
13248}
13249
INADA Naoki3ae20562017-01-16 20:41:20 +090013250/*[clinic input]
13251str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252
INADA Naoki3ae20562017-01-16 20:41:20 +090013253 table: object
13254 Translation table, which must be a mapping of Unicode ordinals to
13255 Unicode ordinals, strings, or None.
13256 /
13257
13258Replace each character in the string using the given translation table.
13259
13260The table must implement lookup/indexing via __getitem__, for instance a
13261dictionary or list. If this operation raises LookupError, the character is
13262left untouched. Characters mapped to None are deleted.
13263[clinic start generated code]*/
13264
13265static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013266unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013267/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270}
13271
INADA Naoki3ae20562017-01-16 20:41:20 +090013272/*[clinic input]
13273str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274
INADA Naoki3ae20562017-01-16 20:41:20 +090013275Return a copy of the string converted to uppercase.
13276[clinic start generated code]*/
13277
13278static PyObject *
13279unicode_upper_impl(PyObject *self)
13280/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013282 if (PyUnicode_READY(self) == -1)
13283 return NULL;
13284 if (PyUnicode_IS_ASCII(self))
13285 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013286 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287}
13288
INADA Naoki3ae20562017-01-16 20:41:20 +090013289/*[clinic input]
13290str.zfill as unicode_zfill
13291
13292 width: Py_ssize_t
13293 /
13294
13295Pad a numeric string with zeros on the left, to fill a field of the given width.
13296
13297The string is never truncated.
13298[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299
13300static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013301unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013302/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013304 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013305 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 int kind;
13307 void *data;
13308 Py_UCS4 chr;
13309
Benjamin Petersonbac79492012-01-14 13:34:47 -050013310 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312
Victor Stinnerc4b49542011-12-11 22:44:26 +010013313 if (PyUnicode_GET_LENGTH(self) >= width)
13314 return unicode_result_unchanged(self);
13315
13316 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317
13318 u = pad(self, fill, 0, '0');
13319
Walter Dörwald068325e2002-04-15 13:36:47 +000013320 if (u == NULL)
13321 return NULL;
13322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013323 kind = PyUnicode_KIND(u);
13324 data = PyUnicode_DATA(u);
13325 chr = PyUnicode_READ(kind, data, fill);
13326
13327 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013328 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 PyUnicode_WRITE(kind, data, 0, chr);
13330 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331 }
13332
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013333 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013334 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336
#if 0
/* Compiled out.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013345PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013348Return True if S starts with the specified prefix, False otherwise.\n\
13349With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013350With optional end, stop comparing S at that position.\n\
13351prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352
13353static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013354unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013355 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013357 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013358 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013359 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013360 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013361 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362
Jesus Ceaac451502011-04-20 17:09:23 +020013363 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013364 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013365 if (PyTuple_Check(subobj)) {
13366 Py_ssize_t i;
13367 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013368 substring = PyTuple_GET_ITEM(subobj, i);
13369 if (!PyUnicode_Check(substring)) {
13370 PyErr_Format(PyExc_TypeError,
13371 "tuple for startswith must only contain str, "
13372 "not %.100s",
13373 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013374 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013375 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013376 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013377 if (result == -1)
13378 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013379 if (result) {
13380 Py_RETURN_TRUE;
13381 }
13382 }
13383 /* nothing matched */
13384 Py_RETURN_FALSE;
13385 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013386 if (!PyUnicode_Check(subobj)) {
13387 PyErr_Format(PyExc_TypeError,
13388 "startswith first arg must be str or "
13389 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013391 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013392 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013393 if (result == -1)
13394 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013395 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396}
13397
13398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013399PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013401\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013402Return True if S ends with the specified suffix, False otherwise.\n\
13403With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013404With optional end, stop comparing S at that position.\n\
13405suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406
13407static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013408unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013409 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013410{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013411 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013412 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013413 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013414 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013415 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416
Jesus Ceaac451502011-04-20 17:09:23 +020013417 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013419 if (PyTuple_Check(subobj)) {
13420 Py_ssize_t i;
13421 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013422 substring = PyTuple_GET_ITEM(subobj, i);
13423 if (!PyUnicode_Check(substring)) {
13424 PyErr_Format(PyExc_TypeError,
13425 "tuple for endswith must only contain str, "
13426 "not %.100s",
13427 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013429 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013430 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013431 if (result == -1)
13432 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013433 if (result) {
13434 Py_RETURN_TRUE;
13435 }
13436 }
13437 Py_RETURN_FALSE;
13438 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013439 if (!PyUnicode_Check(subobj)) {
13440 PyErr_Format(PyExc_TypeError,
13441 "endswith first arg must be str or "
13442 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013444 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013445 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013446 if (result == -1)
13447 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013448 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449}
13450
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013451static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013452_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013453{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013454 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13455 writer->data = PyUnicode_DATA(writer->buffer);
13456
13457 if (!writer->readonly) {
13458 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013459 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013460 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013461 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013462 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13463 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13464 writer->kind = PyUnicode_WCHAR_KIND;
13465 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13466
Victor Stinner8f674cc2013-04-17 23:02:17 +020013467 /* Copy-on-write mode: set buffer size to 0 so
13468 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13469 * next write. */
13470 writer->size = 0;
13471 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013472}
13473
Victor Stinnerd3f08822012-05-29 12:57:52 +020013474void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013475_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013476{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013477 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013478
13479 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013480 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013481
13482 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13483 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13484 writer->kind = PyUnicode_WCHAR_KIND;
13485 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013486}
13487
Victor Stinnerd3f08822012-05-29 12:57:52 +020013488int
13489_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13490 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013491{
13492 Py_ssize_t newlen;
13493 PyObject *newbuffer;
13494
Victor Stinner2740e462016-09-06 16:58:36 -070013495 assert(maxchar <= MAX_UNICODE);
13496
Victor Stinnerca9381e2015-09-22 00:58:32 +020013497 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013498 assert((maxchar > writer->maxchar && length >= 0)
13499 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013500
Victor Stinner202fdca2012-05-07 12:47:02 +020013501 if (length > PY_SSIZE_T_MAX - writer->pos) {
13502 PyErr_NoMemory();
13503 return -1;
13504 }
13505 newlen = writer->pos + length;
13506
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013507 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013508
Victor Stinnerd3f08822012-05-29 12:57:52 +020013509 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013510 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013511 if (writer->overallocate
13512 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13513 /* overallocate to limit the number of realloc() */
13514 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013515 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013516 if (newlen < writer->min_length)
13517 newlen = writer->min_length;
13518
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 writer->buffer = PyUnicode_New(newlen, maxchar);
13520 if (writer->buffer == NULL)
13521 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013522 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013523 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013524 if (writer->overallocate
13525 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13526 /* overallocate to limit the number of realloc() */
13527 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013528 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013529 if (newlen < writer->min_length)
13530 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013531
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013532 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013533 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013534 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013535 newbuffer = PyUnicode_New(newlen, maxchar);
13536 if (newbuffer == NULL)
13537 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013538 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13539 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013540 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013541 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013542 }
13543 else {
13544 newbuffer = resize_compact(writer->buffer, newlen);
13545 if (newbuffer == NULL)
13546 return -1;
13547 }
13548 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013549 }
13550 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013551 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013552 newbuffer = PyUnicode_New(writer->size, maxchar);
13553 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013554 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013555 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13556 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013557 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013558 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013559 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013560 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013561
13562#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013563}
13564
Victor Stinnerca9381e2015-09-22 00:58:32 +020013565int
13566_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13567 enum PyUnicode_Kind kind)
13568{
13569 Py_UCS4 maxchar;
13570
13571 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13572 assert(writer->kind < kind);
13573
13574 switch (kind)
13575 {
13576 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13577 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13578 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13579 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013580 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013581 }
13582
13583 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13584}
13585
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013586static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013587_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013588{
Victor Stinner2740e462016-09-06 16:58:36 -070013589 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013590 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13591 return -1;
13592 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13593 writer->pos++;
13594 return 0;
13595}
13596
13597int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013598_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13599{
13600 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13601}
13602
13603int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013604_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13605{
13606 Py_UCS4 maxchar;
13607 Py_ssize_t len;
13608
13609 if (PyUnicode_READY(str) == -1)
13610 return -1;
13611 len = PyUnicode_GET_LENGTH(str);
13612 if (len == 0)
13613 return 0;
13614 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13615 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013616 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013617 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013618 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013619 Py_INCREF(str);
13620 writer->buffer = str;
13621 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013622 writer->pos += len;
13623 return 0;
13624 }
13625 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13626 return -1;
13627 }
13628 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13629 str, 0, len);
13630 writer->pos += len;
13631 return 0;
13632}
13633
Victor Stinnere215d962012-10-06 23:03:36 +020013634int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013635_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13636 Py_ssize_t start, Py_ssize_t end)
13637{
13638 Py_UCS4 maxchar;
13639 Py_ssize_t len;
13640
13641 if (PyUnicode_READY(str) == -1)
13642 return -1;
13643
13644 assert(0 <= start);
13645 assert(end <= PyUnicode_GET_LENGTH(str));
13646 assert(start <= end);
13647
13648 if (end == 0)
13649 return 0;
13650
13651 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13652 return _PyUnicodeWriter_WriteStr(writer, str);
13653
13654 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13655 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13656 else
13657 maxchar = writer->maxchar;
13658 len = end - start;
13659
13660 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13661 return -1;
13662
13663 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13664 str, start, len);
13665 writer->pos += len;
13666 return 0;
13667}
13668
13669int
Victor Stinner4a587072013-11-19 12:54:53 +010013670_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13671 const char *ascii, Py_ssize_t len)
13672{
13673 if (len == -1)
13674 len = strlen(ascii);
13675
13676 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13677
13678 if (writer->buffer == NULL && !writer->overallocate) {
13679 PyObject *str;
13680
13681 str = _PyUnicode_FromASCII(ascii, len);
13682 if (str == NULL)
13683 return -1;
13684
13685 writer->readonly = 1;
13686 writer->buffer = str;
13687 _PyUnicodeWriter_Update(writer);
13688 writer->pos += len;
13689 return 0;
13690 }
13691
13692 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13693 return -1;
13694
13695 switch (writer->kind)
13696 {
13697 case PyUnicode_1BYTE_KIND:
13698 {
13699 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13700 Py_UCS1 *data = writer->data;
13701
Christian Heimesf051e432016-09-13 20:22:02 +020013702 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013703 break;
13704 }
13705 case PyUnicode_2BYTE_KIND:
13706 {
13707 _PyUnicode_CONVERT_BYTES(
13708 Py_UCS1, Py_UCS2,
13709 ascii, ascii + len,
13710 (Py_UCS2 *)writer->data + writer->pos);
13711 break;
13712 }
13713 case PyUnicode_4BYTE_KIND:
13714 {
13715 _PyUnicode_CONVERT_BYTES(
13716 Py_UCS1, Py_UCS4,
13717 ascii, ascii + len,
13718 (Py_UCS4 *)writer->data + writer->pos);
13719 break;
13720 }
13721 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013722 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013723 }
13724
13725 writer->pos += len;
13726 return 0;
13727}
13728
13729int
13730_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13731 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013732{
13733 Py_UCS4 maxchar;
13734
13735 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13736 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13737 return -1;
13738 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13739 writer->pos += len;
13740 return 0;
13741}
13742
Victor Stinnerd3f08822012-05-29 12:57:52 +020013743PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013744_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013745{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013746 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013747
Victor Stinnerd3f08822012-05-29 12:57:52 +020013748 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013749 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013750 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013751 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013752
13753 str = writer->buffer;
13754 writer->buffer = NULL;
13755
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013756 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013757 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13758 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013759 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013760
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013761 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13762 PyObject *str2;
13763 str2 = resize_compact(str, writer->pos);
13764 if (str2 == NULL) {
13765 Py_DECREF(str);
13766 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013767 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013768 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013769 }
13770
Victor Stinner15a0bd32013-07-08 22:29:55 +020013771 assert(_PyUnicode_CheckConsistency(str, 1));
13772 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013773}
13774
Victor Stinnerd3f08822012-05-29 12:57:52 +020013775void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013776_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013777{
13778 Py_CLEAR(writer->buffer);
13779}
13780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013781#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013782
13783PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013784 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013785\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013786Return a formatted version of S, using substitutions from args and kwargs.\n\
13787The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013788
Eric Smith27bbca62010-11-04 17:06:58 +000013789PyDoc_STRVAR(format_map__doc__,
13790 "S.format_map(mapping) -> str\n\
13791\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013792Return a formatted version of S, using substitutions from mapping.\n\
13793The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013794
INADA Naoki3ae20562017-01-16 20:41:20 +090013795/*[clinic input]
13796str.__format__ as unicode___format__
13797
13798 format_spec: unicode
13799 /
13800
13801Return a formatted version of the string as described by format_spec.
13802[clinic start generated code]*/
13803
Eric Smith4a7d76d2008-05-30 18:10:19 +000013804static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013805unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013806/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013807{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013808 _PyUnicodeWriter writer;
13809 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013810
Victor Stinnerd3f08822012-05-29 12:57:52 +020013811 if (PyUnicode_READY(self) == -1)
13812 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013813 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013814 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13815 self, format_spec, 0,
13816 PyUnicode_GET_LENGTH(format_spec));
13817 if (ret == -1) {
13818 _PyUnicodeWriter_Dealloc(&writer);
13819 return NULL;
13820 }
13821 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013822}
13823
INADA Naoki3ae20562017-01-16 20:41:20 +090013824/*[clinic input]
13825str.__sizeof__ as unicode_sizeof
13826
13827Return the size of the string in memory, in bytes.
13828[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013829
13830static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013831unicode_sizeof_impl(PyObject *self)
13832/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013833{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013834 Py_ssize_t size;
13835
13836 /* If it's a compact object, account for base structure +
13837 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013838 if (PyUnicode_IS_COMPACT_ASCII(self))
13839 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13840 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013841 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013842 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013843 else {
13844 /* If it is a two-block object, account for base object, and
13845 for character block if present. */
13846 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013847 if (_PyUnicode_DATA_ANY(self))
13848 size += (PyUnicode_GET_LENGTH(self) + 1) *
13849 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013850 }
13851 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013852 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013853 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13854 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13855 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13856 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013857
13858 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013859}
13860
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013861static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013862unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013863{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013864 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013865 if (!copy)
13866 return NULL;
13867 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013868}
13869
Guido van Rossumd57fd912000-03-10 22:53:23 +000013870static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013871 UNICODE_ENCODE_METHODDEF
13872 UNICODE_REPLACE_METHODDEF
13873 UNICODE_SPLIT_METHODDEF
13874 UNICODE_RSPLIT_METHODDEF
13875 UNICODE_JOIN_METHODDEF
13876 UNICODE_CAPITALIZE_METHODDEF
13877 UNICODE_CASEFOLD_METHODDEF
13878 UNICODE_TITLE_METHODDEF
13879 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013880 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013881 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013882 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013883 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013884 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013885 UNICODE_LJUST_METHODDEF
13886 UNICODE_LOWER_METHODDEF
13887 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013888 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13889 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013890 UNICODE_RJUST_METHODDEF
13891 UNICODE_RSTRIP_METHODDEF
13892 UNICODE_RPARTITION_METHODDEF
13893 UNICODE_SPLITLINES_METHODDEF
13894 UNICODE_STRIP_METHODDEF
13895 UNICODE_SWAPCASE_METHODDEF
13896 UNICODE_TRANSLATE_METHODDEF
13897 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013898 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13899 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013900 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013901 UNICODE_ISLOWER_METHODDEF
13902 UNICODE_ISUPPER_METHODDEF
13903 UNICODE_ISTITLE_METHODDEF
13904 UNICODE_ISSPACE_METHODDEF
13905 UNICODE_ISDECIMAL_METHODDEF
13906 UNICODE_ISDIGIT_METHODDEF
13907 UNICODE_ISNUMERIC_METHODDEF
13908 UNICODE_ISALPHA_METHODDEF
13909 UNICODE_ISALNUM_METHODDEF
13910 UNICODE_ISIDENTIFIER_METHODDEF
13911 UNICODE_ISPRINTABLE_METHODDEF
13912 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013913 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013914 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013915 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013916 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013917 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013918#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013919 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013920 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013921#endif
13922
Benjamin Peterson14339b62009-01-31 16:36:08 +000013923 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013924 {NULL, NULL}
13925};
13926
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013927static PyObject *
13928unicode_mod(PyObject *v, PyObject *w)
13929{
Brian Curtindfc80e32011-08-10 20:28:54 -050013930 if (!PyUnicode_Check(v))
13931 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013933}
13934
13935static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013936 0, /*nb_add*/
13937 0, /*nb_subtract*/
13938 0, /*nb_multiply*/
13939 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013940};
13941
Guido van Rossumd57fd912000-03-10 22:53:23 +000013942static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013943 (lenfunc) unicode_length, /* sq_length */
13944 PyUnicode_Concat, /* sq_concat */
13945 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13946 (ssizeargfunc) unicode_getitem, /* sq_item */
13947 0, /* sq_slice */
13948 0, /* sq_ass_item */
13949 0, /* sq_ass_slice */
13950 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951};
13952
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013953static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013954unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013956 if (PyUnicode_READY(self) == -1)
13957 return NULL;
13958
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013959 if (PyIndex_Check(item)) {
13960 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013961 if (i == -1 && PyErr_Occurred())
13962 return NULL;
13963 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013964 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013965 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013966 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013967 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013968 PyObject *result;
13969 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013970 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013971 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013972
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013973 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013974 return NULL;
13975 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013976 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13977 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013978
13979 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013980 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013981 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013982 slicelength == PyUnicode_GET_LENGTH(self)) {
13983 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013984 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013985 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013986 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013987 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013988 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013989 src_kind = PyUnicode_KIND(self);
13990 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013991 if (!PyUnicode_IS_ASCII(self)) {
13992 kind_limit = kind_maxchar_limit(src_kind);
13993 max_char = 0;
13994 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13995 ch = PyUnicode_READ(src_kind, src_data, cur);
13996 if (ch > max_char) {
13997 max_char = ch;
13998 if (max_char >= kind_limit)
13999 break;
14000 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014001 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014002 }
Victor Stinner55c99112011-10-13 01:17:06 +020014003 else
14004 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014005 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014006 if (result == NULL)
14007 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014008 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014009 dest_data = PyUnicode_DATA(result);
14010
14011 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014012 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14013 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014014 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014015 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014016 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014017 } else {
14018 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14019 return NULL;
14020 }
14021}
14022
14023static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014024 (lenfunc)unicode_length, /* mp_length */
14025 (binaryfunc)unicode_subscript, /* mp_subscript */
14026 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014027};
14028
Guido van Rossumd57fd912000-03-10 22:53:23 +000014029
Guido van Rossumd57fd912000-03-10 22:53:23 +000014030/* Helpers for PyUnicode_Format() */
14031
Victor Stinnera47082312012-10-04 02:19:54 +020014032struct unicode_formatter_t {
14033 PyObject *args;
14034 int args_owned;
14035 Py_ssize_t arglen, argidx;
14036 PyObject *dict;
14037
14038 enum PyUnicode_Kind fmtkind;
14039 Py_ssize_t fmtcnt, fmtpos;
14040 void *fmtdata;
14041 PyObject *fmtstr;
14042
14043 _PyUnicodeWriter writer;
14044};
14045
/* Description of one conversion specifier, as consumed by formatfloat()
   and mainformatlong(). */
struct unicode_format_arg_t {
    /* Conversion character; passed on as the format code / type. */
    Py_UCS4 ch;
    /* Bitmask of F_* flags (F_ALT, F_SIGN and F_BLANK are tested by the
       helpers). */
    int flags;
    /* Field width; mainformatlong() compares it with -1 before taking its
       fast path (presumably "no width given" -- confirm with the parser). */
    Py_ssize_t width;
    /* Precision; a negative value makes formatfloat() use 6. */
    int prec;
    /* NOTE(review): not referenced by the helpers visible here; meaning to
       be confirmed at the use site. */
    int sign;
};
14053
Guido van Rossumd57fd912000-03-10 22:53:23 +000014054static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014055unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014056{
Victor Stinnera47082312012-10-04 02:19:54 +020014057 Py_ssize_t argidx = ctx->argidx;
14058
14059 if (argidx < ctx->arglen) {
14060 ctx->argidx++;
14061 if (ctx->arglen < 0)
14062 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014063 else
Victor Stinnera47082312012-10-04 02:19:54 +020014064 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014065 }
14066 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014067 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014068 return NULL;
14069}
14070
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014071/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014072
Victor Stinnera47082312012-10-04 02:19:54 +020014073/* Format a float into the writer if the writer is not NULL, or into *p_output
14074 otherwise.
14075
14076 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014077static int
Victor Stinnera47082312012-10-04 02:19:54 +020014078formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14079 PyObject **p_output,
14080 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014081{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014082 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014083 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014084 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014085 int prec;
14086 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014087
Guido van Rossumd57fd912000-03-10 22:53:23 +000014088 x = PyFloat_AsDouble(v);
14089 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014090 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014091
Victor Stinnera47082312012-10-04 02:19:54 +020014092 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014093 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014094 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014095
Victor Stinnera47082312012-10-04 02:19:54 +020014096 if (arg->flags & F_ALT)
14097 dtoa_flags = Py_DTSF_ALT;
14098 else
14099 dtoa_flags = 0;
14100 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014101 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014102 return -1;
14103 len = strlen(p);
14104 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014105 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014106 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014107 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014108 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014109 }
14110 else
14111 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014112 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014113 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014114}
14115
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python ints.
 * Return value:  a new reference to a str object, or NULL if error.
 * The output string is of the form
 *      "-"? ("0o" | "0x" | "0X")? digit+
 * The base marker is present only for o, x and X conversions, and only
 * when alt is non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base marker (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * The string returned by PyNumber_ToBase() is modified in place, which is
 * why it must have exactly one reference.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep "numnondigits + prec" (an int sum, numnondigits <= 3) from
       overflowing below. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* two non-digit characters: the base marker */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        /* two non-digit characters: the base marker */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; buf no longer points at the start of
           result's data after this. */
        buf += 2;
        len -= 2;
        /* Re-create the sign just before the first digit. */
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object: non-digit
           prefix, then the zero padding, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        /* From here on result is the bytes object, not a str. */
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in the temporary bytes object, or no longer
       starts at the beginning of result's data, build a fresh str from
       it; otherwise result is returned, resized if its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14261
Ethan Furmandf3ed242014-01-05 06:50:30 -080014262/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014263 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014264 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014265 * -1 and raise an exception on error */
14266static int
Victor Stinnera47082312012-10-04 02:19:54 +020014267mainformatlong(PyObject *v,
14268 struct unicode_format_arg_t *arg,
14269 PyObject **p_output,
14270 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014271{
14272 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014273 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014274
14275 if (!PyNumber_Check(v))
14276 goto wrongtype;
14277
Ethan Furman9ab74802014-03-21 06:38:46 -070014278 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014279 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014280 if (type == 'o' || type == 'x' || type == 'X') {
14281 iobj = PyNumber_Index(v);
14282 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014283 if (PyErr_ExceptionMatches(PyExc_TypeError))
14284 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014285 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014286 }
14287 }
14288 else {
14289 iobj = PyNumber_Long(v);
14290 if (iobj == NULL ) {
14291 if (PyErr_ExceptionMatches(PyExc_TypeError))
14292 goto wrongtype;
14293 return -1;
14294 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014295 }
14296 assert(PyLong_Check(iobj));
14297 }
14298 else {
14299 iobj = v;
14300 Py_INCREF(iobj);
14301 }
14302
14303 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014304 && arg->width == -1 && arg->prec == -1
14305 && !(arg->flags & (F_SIGN | F_BLANK))
14306 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014307 {
14308 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014309 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014310 int base;
14311
Victor Stinnera47082312012-10-04 02:19:54 +020014312 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014313 {
14314 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014315 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014316 case 'd':
14317 case 'i':
14318 case 'u':
14319 base = 10;
14320 break;
14321 case 'o':
14322 base = 8;
14323 break;
14324 case 'x':
14325 case 'X':
14326 base = 16;
14327 break;
14328 }
14329
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014330 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14331 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014332 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014333 }
14334 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014335 return 1;
14336 }
14337
Ethan Furmanb95b5612015-01-23 20:05:18 -080014338 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014339 Py_DECREF(iobj);
14340 if (res == NULL)
14341 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014342 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014343 return 0;
14344
14345wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014346 switch(type)
14347 {
14348 case 'o':
14349 case 'x':
14350 case 'X':
14351 PyErr_Format(PyExc_TypeError,
14352 "%%%c format: an integer is required, "
14353 "not %.200s",
14354 type, Py_TYPE(v)->tp_name);
14355 break;
14356 default:
14357 PyErr_Format(PyExc_TypeError,
14358 "%%%c format: a number is required, "
14359 "not %.200s",
14360 type, Py_TYPE(v)->tp_name);
14361 break;
14362 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014363 return -1;
14364}
14365
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014366static Py_UCS4
14367formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014368{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014369 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014370 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014371 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014372 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014373 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014374 goto onError;
14375 }
14376 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014377 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014378 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014379 /* make sure number is a type of integer */
14380 if (!PyLong_Check(v)) {
14381 iobj = PyNumber_Index(v);
14382 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014383 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014384 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014385 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014386 Py_DECREF(iobj);
14387 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014388 else {
14389 x = PyLong_AsLong(v);
14390 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014391 if (x == -1 && PyErr_Occurred())
14392 goto onError;
14393
Victor Stinner8faf8212011-12-08 22:14:11 +010014394 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014395 PyErr_SetString(PyExc_OverflowError,
14396 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014397 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014398 }
14399
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014400 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014401 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014402
Benjamin Peterson29060642009-01-31 22:14:21 +000014403 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014404 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014405 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014406 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014407}
14408
Victor Stinnera47082312012-10-04 02:19:54 +020014409/* Parse options of an argument: flags, width, precision.
14410 Handle also "%(name)" syntax.
14411
14412 Return 0 if the argument has been formatted into arg->str.
14413 Return 1 if the argument has been written into ctx->writer,
14414 Raise an exception and return -1 on error. */
14415static int
14416unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14417 struct unicode_format_arg_t *arg)
14418{
14419#define FORMAT_READ(ctx) \
14420 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14421
14422 PyObject *v;
14423
Victor Stinnera47082312012-10-04 02:19:54 +020014424 if (arg->ch == '(') {
14425 /* Get argument value from a dictionary. Example: "%(name)s". */
14426 Py_ssize_t keystart;
14427 Py_ssize_t keylen;
14428 PyObject *key;
14429 int pcount = 1;
14430
14431 if (ctx->dict == NULL) {
14432 PyErr_SetString(PyExc_TypeError,
14433 "format requires a mapping");
14434 return -1;
14435 }
14436 ++ctx->fmtpos;
14437 --ctx->fmtcnt;
14438 keystart = ctx->fmtpos;
14439 /* Skip over balanced parentheses */
14440 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14441 arg->ch = FORMAT_READ(ctx);
14442 if (arg->ch == ')')
14443 --pcount;
14444 else if (arg->ch == '(')
14445 ++pcount;
14446 ctx->fmtpos++;
14447 }
14448 keylen = ctx->fmtpos - keystart - 1;
14449 if (ctx->fmtcnt < 0 || pcount > 0) {
14450 PyErr_SetString(PyExc_ValueError,
14451 "incomplete format key");
14452 return -1;
14453 }
14454 key = PyUnicode_Substring(ctx->fmtstr,
14455 keystart, keystart + keylen);
14456 if (key == NULL)
14457 return -1;
14458 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014459 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014460 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014461 }
14462 ctx->args = PyObject_GetItem(ctx->dict, key);
14463 Py_DECREF(key);
14464 if (ctx->args == NULL)
14465 return -1;
14466 ctx->args_owned = 1;
14467 ctx->arglen = -1;
14468 ctx->argidx = -2;
14469 }
14470
14471 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014472 while (--ctx->fmtcnt >= 0) {
14473 arg->ch = FORMAT_READ(ctx);
14474 ctx->fmtpos++;
14475 switch (arg->ch) {
14476 case '-': arg->flags |= F_LJUST; continue;
14477 case '+': arg->flags |= F_SIGN; continue;
14478 case ' ': arg->flags |= F_BLANK; continue;
14479 case '#': arg->flags |= F_ALT; continue;
14480 case '0': arg->flags |= F_ZERO; continue;
14481 }
14482 break;
14483 }
14484
14485 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014486 if (arg->ch == '*') {
14487 v = unicode_format_getnextarg(ctx);
14488 if (v == NULL)
14489 return -1;
14490 if (!PyLong_Check(v)) {
14491 PyErr_SetString(PyExc_TypeError,
14492 "* wants int");
14493 return -1;
14494 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014495 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014496 if (arg->width == -1 && PyErr_Occurred())
14497 return -1;
14498 if (arg->width < 0) {
14499 arg->flags |= F_LJUST;
14500 arg->width = -arg->width;
14501 }
14502 if (--ctx->fmtcnt >= 0) {
14503 arg->ch = FORMAT_READ(ctx);
14504 ctx->fmtpos++;
14505 }
14506 }
14507 else if (arg->ch >= '0' && arg->ch <= '9') {
14508 arg->width = arg->ch - '0';
14509 while (--ctx->fmtcnt >= 0) {
14510 arg->ch = FORMAT_READ(ctx);
14511 ctx->fmtpos++;
14512 if (arg->ch < '0' || arg->ch > '9')
14513 break;
14514 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14515 mixing signed and unsigned comparison. Since arg->ch is between
14516 '0' and '9', casting to int is safe. */
14517 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14518 PyErr_SetString(PyExc_ValueError,
14519 "width too big");
14520 return -1;
14521 }
14522 arg->width = arg->width*10 + (arg->ch - '0');
14523 }
14524 }
14525
14526 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014527 if (arg->ch == '.') {
14528 arg->prec = 0;
14529 if (--ctx->fmtcnt >= 0) {
14530 arg->ch = FORMAT_READ(ctx);
14531 ctx->fmtpos++;
14532 }
14533 if (arg->ch == '*') {
14534 v = unicode_format_getnextarg(ctx);
14535 if (v == NULL)
14536 return -1;
14537 if (!PyLong_Check(v)) {
14538 PyErr_SetString(PyExc_TypeError,
14539 "* wants int");
14540 return -1;
14541 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014542 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014543 if (arg->prec == -1 && PyErr_Occurred())
14544 return -1;
14545 if (arg->prec < 0)
14546 arg->prec = 0;
14547 if (--ctx->fmtcnt >= 0) {
14548 arg->ch = FORMAT_READ(ctx);
14549 ctx->fmtpos++;
14550 }
14551 }
14552 else if (arg->ch >= '0' && arg->ch <= '9') {
14553 arg->prec = arg->ch - '0';
14554 while (--ctx->fmtcnt >= 0) {
14555 arg->ch = FORMAT_READ(ctx);
14556 ctx->fmtpos++;
14557 if (arg->ch < '0' || arg->ch > '9')
14558 break;
14559 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14560 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014561 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014562 return -1;
14563 }
14564 arg->prec = arg->prec*10 + (arg->ch - '0');
14565 }
14566 }
14567 }
14568
14569 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14570 if (ctx->fmtcnt >= 0) {
14571 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14572 if (--ctx->fmtcnt >= 0) {
14573 arg->ch = FORMAT_READ(ctx);
14574 ctx->fmtpos++;
14575 }
14576 }
14577 }
14578 if (ctx->fmtcnt < 0) {
14579 PyErr_SetString(PyExc_ValueError,
14580 "incomplete format");
14581 return -1;
14582 }
14583 return 0;
14584
14585#undef FORMAT_READ
14586}
14587
14588/* Format one argument. Supported conversion specifiers:
14589
14590 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014591 - "i", "d", "u": int or float
14592 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014593 - "e", "E", "f", "F", "g", "G": float
14594 - "c": int or str (1 character)
14595
Victor Stinner8dbd4212012-12-04 09:30:24 +010014596 When possible, the output is written directly into the Unicode writer
14597 (ctx->writer). A string is created when padding is required.
14598
Victor Stinnera47082312012-10-04 02:19:54 +020014599 Return 0 if the argument has been formatted into *p_str,
14600 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014601 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014602static int
14603unicode_format_arg_format(struct unicode_formatter_t *ctx,
14604 struct unicode_format_arg_t *arg,
14605 PyObject **p_str)
14606{
14607 PyObject *v;
14608 _PyUnicodeWriter *writer = &ctx->writer;
14609
14610 if (ctx->fmtcnt == 0)
14611 ctx->writer.overallocate = 0;
14612
Victor Stinnera47082312012-10-04 02:19:54 +020014613 v = unicode_format_getnextarg(ctx);
14614 if (v == NULL)
14615 return -1;
14616
Victor Stinnera47082312012-10-04 02:19:54 +020014617
14618 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014619 case 's':
14620 case 'r':
14621 case 'a':
14622 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14623 /* Fast path */
14624 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14625 return -1;
14626 return 1;
14627 }
14628
14629 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14630 *p_str = v;
14631 Py_INCREF(*p_str);
14632 }
14633 else {
14634 if (arg->ch == 's')
14635 *p_str = PyObject_Str(v);
14636 else if (arg->ch == 'r')
14637 *p_str = PyObject_Repr(v);
14638 else
14639 *p_str = PyObject_ASCII(v);
14640 }
14641 break;
14642
14643 case 'i':
14644 case 'd':
14645 case 'u':
14646 case 'o':
14647 case 'x':
14648 case 'X':
14649 {
14650 int ret = mainformatlong(v, arg, p_str, writer);
14651 if (ret != 0)
14652 return ret;
14653 arg->sign = 1;
14654 break;
14655 }
14656
14657 case 'e':
14658 case 'E':
14659 case 'f':
14660 case 'F':
14661 case 'g':
14662 case 'G':
14663 if (arg->width == -1 && arg->prec == -1
14664 && !(arg->flags & (F_SIGN | F_BLANK)))
14665 {
14666 /* Fast path */
14667 if (formatfloat(v, arg, NULL, writer) == -1)
14668 return -1;
14669 return 1;
14670 }
14671
14672 arg->sign = 1;
14673 if (formatfloat(v, arg, p_str, NULL) == -1)
14674 return -1;
14675 break;
14676
14677 case 'c':
14678 {
14679 Py_UCS4 ch = formatchar(v);
14680 if (ch == (Py_UCS4) -1)
14681 return -1;
14682 if (arg->width == -1 && arg->prec == -1) {
14683 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014684 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014685 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014686 return 1;
14687 }
14688 *p_str = PyUnicode_FromOrdinal(ch);
14689 break;
14690 }
14691
14692 default:
14693 PyErr_Format(PyExc_ValueError,
14694 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014695 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014696 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14697 (int)arg->ch,
14698 ctx->fmtpos - 1);
14699 return -1;
14700 }
14701 if (*p_str == NULL)
14702 return -1;
14703 assert (PyUnicode_Check(*p_str));
14704 return 0;
14705}
14706
14707static int
14708unicode_format_arg_output(struct unicode_formatter_t *ctx,
14709 struct unicode_format_arg_t *arg,
14710 PyObject *str)
14711{
14712 Py_ssize_t len;
14713 enum PyUnicode_Kind kind;
14714 void *pbuf;
14715 Py_ssize_t pindex;
14716 Py_UCS4 signchar;
14717 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014718 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014719 Py_ssize_t sublen;
14720 _PyUnicodeWriter *writer = &ctx->writer;
14721 Py_UCS4 fill;
14722
14723 fill = ' ';
14724 if (arg->sign && arg->flags & F_ZERO)
14725 fill = '0';
14726
14727 if (PyUnicode_READY(str) == -1)
14728 return -1;
14729
14730 len = PyUnicode_GET_LENGTH(str);
14731 if ((arg->width == -1 || arg->width <= len)
14732 && (arg->prec == -1 || arg->prec >= len)
14733 && !(arg->flags & (F_SIGN | F_BLANK)))
14734 {
14735 /* Fast path */
14736 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14737 return -1;
14738 return 0;
14739 }
14740
14741 /* Truncate the string for "s", "r" and "a" formats
14742 if the precision is set */
14743 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14744 if (arg->prec >= 0 && len > arg->prec)
14745 len = arg->prec;
14746 }
14747
14748 /* Adjust sign and width */
14749 kind = PyUnicode_KIND(str);
14750 pbuf = PyUnicode_DATA(str);
14751 pindex = 0;
14752 signchar = '\0';
14753 if (arg->sign) {
14754 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14755 if (ch == '-' || ch == '+') {
14756 signchar = ch;
14757 len--;
14758 pindex++;
14759 }
14760 else if (arg->flags & F_SIGN)
14761 signchar = '+';
14762 else if (arg->flags & F_BLANK)
14763 signchar = ' ';
14764 else
14765 arg->sign = 0;
14766 }
14767 if (arg->width < len)
14768 arg->width = len;
14769
14770 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014771 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014772 if (!(arg->flags & F_LJUST)) {
14773 if (arg->sign) {
14774 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014775 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014776 }
14777 else {
14778 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014779 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014780 }
14781 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014782 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14783 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014784 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014785 }
14786
Victor Stinnera47082312012-10-04 02:19:54 +020014787 buflen = arg->width;
14788 if (arg->sign && len == arg->width)
14789 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014790 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014791 return -1;
14792
14793 /* Write the sign if needed */
14794 if (arg->sign) {
14795 if (fill != ' ') {
14796 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14797 writer->pos += 1;
14798 }
14799 if (arg->width > len)
14800 arg->width--;
14801 }
14802
14803 /* Write the numeric prefix for "x", "X" and "o" formats
14804 if the alternate form is used.
14805 For example, write "0x" for the "%#x" format. */
14806 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14807 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14808 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14809 if (fill != ' ') {
14810 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14811 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14812 writer->pos += 2;
14813 pindex += 2;
14814 }
14815 arg->width -= 2;
14816 if (arg->width < 0)
14817 arg->width = 0;
14818 len -= 2;
14819 }
14820
14821 /* Pad left with the fill character if needed */
14822 if (arg->width > len && !(arg->flags & F_LJUST)) {
14823 sublen = arg->width - len;
14824 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14825 writer->pos += sublen;
14826 arg->width = len;
14827 }
14828
14829 /* If padding with spaces: write sign if needed and/or numeric prefix if
14830 the alternate form is used */
14831 if (fill == ' ') {
14832 if (arg->sign) {
14833 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14834 writer->pos += 1;
14835 }
14836 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14837 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14838 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14839 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14840 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14841 writer->pos += 2;
14842 pindex += 2;
14843 }
14844 }
14845
14846 /* Write characters */
14847 if (len) {
14848 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14849 str, pindex, len);
14850 writer->pos += len;
14851 }
14852
14853 /* Pad right with the fill character if needed */
14854 if (arg->width > len) {
14855 sublen = arg->width - len;
14856 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14857 writer->pos += sublen;
14858 }
14859 return 0;
14860}
14861
14862/* Helper of PyUnicode_Format(): format one arg.
14863 Return 0 on success, raise an exception and return -1 on error. */
14864static int
14865unicode_format_arg(struct unicode_formatter_t *ctx)
14866{
14867 struct unicode_format_arg_t arg;
14868 PyObject *str;
14869 int ret;
14870
Victor Stinner8dbd4212012-12-04 09:30:24 +010014871 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014872 if (arg.ch == '%') {
14873 ctx->fmtpos++;
14874 ctx->fmtcnt--;
14875 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14876 return -1;
14877 return 0;
14878 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014879 arg.flags = 0;
14880 arg.width = -1;
14881 arg.prec = -1;
14882 arg.sign = 0;
14883 str = NULL;
14884
Victor Stinnera47082312012-10-04 02:19:54 +020014885 ret = unicode_format_arg_parse(ctx, &arg);
14886 if (ret == -1)
14887 return -1;
14888
14889 ret = unicode_format_arg_format(ctx, &arg, &str);
14890 if (ret == -1)
14891 return -1;
14892
14893 if (ret != 1) {
14894 ret = unicode_format_arg_output(ctx, &arg, str);
14895 Py_DECREF(str);
14896 if (ret == -1)
14897 return -1;
14898 }
14899
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014900 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014901 PyErr_SetString(PyExc_TypeError,
14902 "not all arguments converted during string formatting");
14903 return -1;
14904 }
14905 return 0;
14906}
14907
Alexander Belopolsky40018472011-02-26 01:02:56 +000014908PyObject *
14909PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014910{
Victor Stinnera47082312012-10-04 02:19:54 +020014911 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014912
Guido van Rossumd57fd912000-03-10 22:53:23 +000014913 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014914 PyErr_BadInternalCall();
14915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014916 }
Victor Stinnera47082312012-10-04 02:19:54 +020014917
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014918 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014919 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014920
14921 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014922 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14923 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14924 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14925 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014926
Victor Stinner8f674cc2013-04-17 23:02:17 +020014927 _PyUnicodeWriter_Init(&ctx.writer);
14928 ctx.writer.min_length = ctx.fmtcnt + 100;
14929 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014930
Guido van Rossumd57fd912000-03-10 22:53:23 +000014931 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014932 ctx.arglen = PyTuple_Size(args);
14933 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014934 }
14935 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014936 ctx.arglen = -1;
14937 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014938 }
Victor Stinnera47082312012-10-04 02:19:54 +020014939 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014940 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014941 ctx.dict = args;
14942 else
14943 ctx.dict = NULL;
14944 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014945
Victor Stinnera47082312012-10-04 02:19:54 +020014946 while (--ctx.fmtcnt >= 0) {
14947 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014948 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014949
14950 nonfmtpos = ctx.fmtpos++;
14951 while (ctx.fmtcnt >= 0 &&
14952 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14953 ctx.fmtpos++;
14954 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014955 }
Victor Stinnera47082312012-10-04 02:19:54 +020014956 if (ctx.fmtcnt < 0) {
14957 ctx.fmtpos--;
14958 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014959 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014960
Victor Stinnercfc4c132013-04-03 01:48:39 +020014961 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14962 nonfmtpos, ctx.fmtpos) < 0)
14963 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014964 }
14965 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014966 ctx.fmtpos++;
14967 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014968 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014969 }
14970 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014971
Victor Stinnera47082312012-10-04 02:19:54 +020014972 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014973 PyErr_SetString(PyExc_TypeError,
14974 "not all arguments converted during string formatting");
14975 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014976 }
14977
Victor Stinnera47082312012-10-04 02:19:54 +020014978 if (ctx.args_owned) {
14979 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014980 }
Victor Stinnera47082312012-10-04 02:19:54 +020014981 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014982
Benjamin Peterson29060642009-01-31 22:14:21 +000014983 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014984 _PyUnicodeWriter_Dealloc(&ctx.writer);
14985 if (ctx.args_owned) {
14986 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014987 }
14988 return NULL;
14989}
14990
Jeremy Hylton938ace62002-07-17 16:30:39 +000014991static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014992unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14993
Tim Peters6d6c1a32001-08-02 04:15:00 +000014994static PyObject *
14995unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14996{
Benjamin Peterson29060642009-01-31 22:14:21 +000014997 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014998 static char *kwlist[] = {"object", "encoding", "errors", 0};
14999 char *encoding = NULL;
15000 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015001
Benjamin Peterson14339b62009-01-31 16:36:08 +000015002 if (type != &PyUnicode_Type)
15003 return unicode_subtype_new(type, args, kwds);
15004 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015005 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015006 return NULL;
15007 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015008 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015009 if (encoding == NULL && errors == NULL)
15010 return PyObject_Str(x);
15011 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015012 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015013}
15014
Guido van Rossume023fe02001-08-30 03:12:59 +000015015static PyObject *
15016unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15017{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015018 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015019 Py_ssize_t length, char_size;
15020 int share_wstr, share_utf8;
15021 unsigned int kind;
15022 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015023
Benjamin Peterson14339b62009-01-31 16:36:08 +000015024 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015025
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015026 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015027 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015028 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015029 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015030 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015031 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015032 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015033 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015034
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015035 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015036 if (self == NULL) {
15037 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015038 return NULL;
15039 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015040 kind = PyUnicode_KIND(unicode);
15041 length = PyUnicode_GET_LENGTH(unicode);
15042
15043 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015044#ifdef Py_DEBUG
15045 _PyUnicode_HASH(self) = -1;
15046#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015047 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015048#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015049 _PyUnicode_STATE(self).interned = 0;
15050 _PyUnicode_STATE(self).kind = kind;
15051 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015052 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015053 _PyUnicode_STATE(self).ready = 1;
15054 _PyUnicode_WSTR(self) = NULL;
15055 _PyUnicode_UTF8_LENGTH(self) = 0;
15056 _PyUnicode_UTF8(self) = NULL;
15057 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015058 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015059
15060 share_utf8 = 0;
15061 share_wstr = 0;
15062 if (kind == PyUnicode_1BYTE_KIND) {
15063 char_size = 1;
15064 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15065 share_utf8 = 1;
15066 }
15067 else if (kind == PyUnicode_2BYTE_KIND) {
15068 char_size = 2;
15069 if (sizeof(wchar_t) == 2)
15070 share_wstr = 1;
15071 }
15072 else {
15073 assert(kind == PyUnicode_4BYTE_KIND);
15074 char_size = 4;
15075 if (sizeof(wchar_t) == 4)
15076 share_wstr = 1;
15077 }
15078
15079 /* Ensure we won't overflow the length. */
15080 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15081 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015082 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015083 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015084 data = PyObject_MALLOC((length + 1) * char_size);
15085 if (data == NULL) {
15086 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015087 goto onError;
15088 }
15089
Victor Stinnerc3c74152011-10-02 20:39:55 +020015090 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015091 if (share_utf8) {
15092 _PyUnicode_UTF8_LENGTH(self) = length;
15093 _PyUnicode_UTF8(self) = data;
15094 }
15095 if (share_wstr) {
15096 _PyUnicode_WSTR_LENGTH(self) = length;
15097 _PyUnicode_WSTR(self) = (wchar_t *)data;
15098 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015099
Christian Heimesf051e432016-09-13 20:22:02 +020015100 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015101 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015102 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015103#ifdef Py_DEBUG
15104 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15105#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015106 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015107 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015108
15109onError:
15110 Py_DECREF(unicode);
15111 Py_DECREF(self);
15112 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015113}
15114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015115PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015116"str(object='') -> str\n\
15117str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015118\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015119Create a new string object from the given object. If encoding or\n\
15120errors is specified, then the object must expose a data buffer\n\
15121that will be decoded using the given encoding and error handler.\n\
15122Otherwise, returns the result of object.__str__() (if defined)\n\
15123or repr(object).\n\
15124encoding defaults to sys.getdefaultencoding().\n\
15125errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015126
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015127static PyObject *unicode_iter(PyObject *seq);
15128
Guido van Rossumd57fd912000-03-10 22:53:23 +000015129PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015130 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015131 "str", /* tp_name */
15132 sizeof(PyUnicodeObject), /* tp_size */
15133 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015134 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015135 (destructor)unicode_dealloc, /* tp_dealloc */
15136 0, /* tp_print */
15137 0, /* tp_getattr */
15138 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015139 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015140 unicode_repr, /* tp_repr */
15141 &unicode_as_number, /* tp_as_number */
15142 &unicode_as_sequence, /* tp_as_sequence */
15143 &unicode_as_mapping, /* tp_as_mapping */
15144 (hashfunc) unicode_hash, /* tp_hash*/
15145 0, /* tp_call*/
15146 (reprfunc) unicode_str, /* tp_str */
15147 PyObject_GenericGetAttr, /* tp_getattro */
15148 0, /* tp_setattro */
15149 0, /* tp_as_buffer */
15150 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015151 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015152 unicode_doc, /* tp_doc */
15153 0, /* tp_traverse */
15154 0, /* tp_clear */
15155 PyUnicode_RichCompare, /* tp_richcompare */
15156 0, /* tp_weaklistoffset */
15157 unicode_iter, /* tp_iter */
15158 0, /* tp_iternext */
15159 unicode_methods, /* tp_methods */
15160 0, /* tp_members */
15161 0, /* tp_getset */
15162 &PyBaseObject_Type, /* tp_base */
15163 0, /* tp_dict */
15164 0, /* tp_descr_get */
15165 0, /* tp_descr_set */
15166 0, /* tp_dictoffset */
15167 0, /* tp_init */
15168 0, /* tp_alloc */
15169 unicode_new, /* tp_new */
15170 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015171};
15172
15173/* Initialize the Unicode implementation */
15174
Victor Stinner3a50e702011-10-18 21:21:00 +020015175int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015176{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015177 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015178 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015179 0x000A, /* LINE FEED */
15180 0x000D, /* CARRIAGE RETURN */
15181 0x001C, /* FILE SEPARATOR */
15182 0x001D, /* GROUP SEPARATOR */
15183 0x001E, /* RECORD SEPARATOR */
15184 0x0085, /* NEXT LINE */
15185 0x2028, /* LINE SEPARATOR */
15186 0x2029, /* PARAGRAPH SEPARATOR */
15187 };
15188
Fred Drakee4315f52000-05-09 19:53:39 +000015189 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015190 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015191 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015192 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015193 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015194
Guido van Rossumcacfc072002-05-24 19:01:59 +000015195 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015196 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015197
15198 /* initialize the linebreak bloom filter */
15199 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015200 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015201 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015202
Christian Heimes26532f72013-07-20 14:57:16 +020015203 if (PyType_Ready(&EncodingMapType) < 0)
15204 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015205
Benjamin Petersonc4311282012-10-30 23:21:10 -040015206 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15207 Py_FatalError("Can't initialize field name iterator type");
15208
15209 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15210 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015211
Victor Stinner3a50e702011-10-18 21:21:00 +020015212 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015213}
15214
15215/* Finalize the Unicode implementation */
15216
/* This implementation performs no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
15222
Guido van Rossumd57fd912000-03-10 22:53:23 +000015223void
Thomas Wouters78890102000-07-22 19:25:51 +000015224_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015225{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015226 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015227
Serhiy Storchaka05997252013-01-26 12:14:02 +020015228 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015229
Serhiy Storchaka05997252013-01-26 12:14:02 +020015230 for (i = 0; i < 256; i++)
15231 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015232 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015233 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015234}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015235
Walter Dörwald16807132007-05-25 13:52:07 +000015236void
15237PyUnicode_InternInPlace(PyObject **p)
15238{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015239 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015240 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015241#ifdef Py_DEBUG
15242 assert(s != NULL);
15243 assert(_PyUnicode_CHECK(s));
15244#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015246 return;
15247#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 /* If it's a subclass, we don't really know what putting
15249 it in the interned dict might do. */
15250 if (!PyUnicode_CheckExact(s))
15251 return;
15252 if (PyUnicode_CHECK_INTERNED(s))
15253 return;
15254 if (interned == NULL) {
15255 interned = PyDict_New();
15256 if (interned == NULL) {
15257 PyErr_Clear(); /* Don't leave an exception */
15258 return;
15259 }
15260 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015261 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015262 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015263 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015264 if (t == NULL) {
15265 PyErr_Clear();
15266 return;
15267 }
15268 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015269 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015270 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015271 return;
15272 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 /* The two references in interned are not counted by refcnt.
15274 The deallocator will take care of this */
15275 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015276 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015277}
15278
15279void
15280PyUnicode_InternImmortal(PyObject **p)
15281{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015282 PyUnicode_InternInPlace(p);
15283 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015284 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015285 Py_INCREF(*p);
15286 }
Walter Dörwald16807132007-05-25 13:52:07 +000015287}
15288
15289PyObject *
15290PyUnicode_InternFromString(const char *cp)
15291{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 PyObject *s = PyUnicode_FromString(cp);
15293 if (s == NULL)
15294 return NULL;
15295 PyUnicode_InternInPlace(&s);
15296 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015297}
15298
Alexander Belopolsky40018472011-02-26 01:02:56 +000015299void
15300_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015301{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015303 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015304 Py_ssize_t i, n;
15305 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015306
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 if (interned == NULL || !PyDict_Check(interned))
15308 return;
15309 keys = PyDict_Keys(interned);
15310 if (keys == NULL || !PyList_Check(keys)) {
15311 PyErr_Clear();
15312 return;
15313 }
Walter Dörwald16807132007-05-25 13:52:07 +000015314
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15316 detector, interned unicode strings are not forcibly deallocated;
15317 rather, we give them their stolen references back, and then clear
15318 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015319
Benjamin Peterson14339b62009-01-31 16:36:08 +000015320 n = PyList_GET_SIZE(keys);
15321 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015322 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015324 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015325 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015326 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015328 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 case SSTATE_NOT_INTERNED:
15330 /* XXX Shouldn't happen */
15331 break;
15332 case SSTATE_INTERNED_IMMORTAL:
15333 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015334 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 break;
15336 case SSTATE_INTERNED_MORTAL:
15337 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015338 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015339 break;
15340 default:
15341 Py_FatalError("Inconsistent interned string state.");
15342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015343 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 }
15345 fprintf(stderr, "total size of all interned strings: "
15346 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15347 "mortal/immortal\n", mortal_size, immortal_size);
15348 Py_DECREF(keys);
15349 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015350 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015351}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015352
15353
15354/********************* Unicode Iterator **************************/
15355
15356typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015357 PyObject_HEAD
15358 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015359 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015360} unicodeiterobject;
15361
15362static void
15363unicodeiter_dealloc(unicodeiterobject *it)
15364{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 _PyObject_GC_UNTRACK(it);
15366 Py_XDECREF(it->it_seq);
15367 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015368}
15369
15370static int
15371unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15372{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 Py_VISIT(it->it_seq);
15374 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015375}
15376
15377static PyObject *
15378unicodeiter_next(unicodeiterobject *it)
15379{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015380 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015381
Benjamin Peterson14339b62009-01-31 16:36:08 +000015382 assert(it != NULL);
15383 seq = it->it_seq;
15384 if (seq == NULL)
15385 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015386 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015388 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15389 int kind = PyUnicode_KIND(seq);
15390 void *data = PyUnicode_DATA(seq);
15391 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15392 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015393 if (item != NULL)
15394 ++it->it_index;
15395 return item;
15396 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015397
Benjamin Peterson14339b62009-01-31 16:36:08 +000015398 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015399 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015400 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015401}
15402
15403static PyObject *
15404unicodeiter_len(unicodeiterobject *it)
15405{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 Py_ssize_t len = 0;
15407 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015408 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015409 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015410}
15411
15412PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15413
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015414static PyObject *
15415unicodeiter_reduce(unicodeiterobject *it)
15416{
15417 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015418 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015419 it->it_seq, it->it_index);
15420 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015421 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015422 if (u == NULL)
15423 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015424 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015425 }
15426}
15427
15428PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15429
15430static PyObject *
15431unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15432{
15433 Py_ssize_t index = PyLong_AsSsize_t(state);
15434 if (index == -1 && PyErr_Occurred())
15435 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015436 if (it->it_seq != NULL) {
15437 if (index < 0)
15438 index = 0;
15439 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15440 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15441 it->it_index = index;
15442 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015443 Py_RETURN_NONE;
15444}
15445
15446PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15447
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015448static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015449 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015450 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015451 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15452 reduce_doc},
15453 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15454 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015455 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015456};
15457
15458PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015459 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15460 "str_iterator", /* tp_name */
15461 sizeof(unicodeiterobject), /* tp_basicsize */
15462 0, /* tp_itemsize */
15463 /* methods */
15464 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15465 0, /* tp_print */
15466 0, /* tp_getattr */
15467 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015468 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015469 0, /* tp_repr */
15470 0, /* tp_as_number */
15471 0, /* tp_as_sequence */
15472 0, /* tp_as_mapping */
15473 0, /* tp_hash */
15474 0, /* tp_call */
15475 0, /* tp_str */
15476 PyObject_GenericGetAttr, /* tp_getattro */
15477 0, /* tp_setattro */
15478 0, /* tp_as_buffer */
15479 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15480 0, /* tp_doc */
15481 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15482 0, /* tp_clear */
15483 0, /* tp_richcompare */
15484 0, /* tp_weaklistoffset */
15485 PyObject_SelfIter, /* tp_iter */
15486 (iternextfunc)unicodeiter_next, /* tp_iternext */
15487 unicodeiter_methods, /* tp_methods */
15488 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015489};
15490
15491static PyObject *
15492unicode_iter(PyObject *seq)
15493{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015495
Benjamin Peterson14339b62009-01-31 16:36:08 +000015496 if (!PyUnicode_Check(seq)) {
15497 PyErr_BadInternalCall();
15498 return NULL;
15499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015500 if (PyUnicode_READY(seq) == -1)
15501 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015502 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15503 if (it == NULL)
15504 return NULL;
15505 it->it_index = 0;
15506 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015507 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015508 _PyObject_GC_TRACK(it);
15509 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015510}
15511
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015512
15513size_t
15514Py_UNICODE_strlen(const Py_UNICODE *u)
15515{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015516 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015517}
15518
15519Py_UNICODE*
15520Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15521{
15522 Py_UNICODE *u = s1;
15523 while ((*u++ = *s2++));
15524 return s1;
15525}
15526
15527Py_UNICODE*
15528Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15529{
15530 Py_UNICODE *u = s1;
15531 while ((*u++ = *s2++))
15532 if (n-- == 0)
15533 break;
15534 return s1;
15535}
15536
15537Py_UNICODE*
15538Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15539{
15540 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015541 u1 += wcslen(u1);
15542 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015543 return s1;
15544}
15545
15546int
15547Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15548{
15549 while (*s1 && *s2 && *s1 == *s2)
15550 s1++, s2++;
15551 if (*s1 && *s2)
15552 return (*s1 < *s2) ? -1 : +1;
15553 if (*s1)
15554 return 1;
15555 if (*s2)
15556 return -1;
15557 return 0;
15558}
15559
15560int
15561Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15562{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015563 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015564 for (; n != 0; n--) {
15565 u1 = *s1;
15566 u2 = *s2;
15567 if (u1 != u2)
15568 return (u1 < u2) ? -1 : +1;
15569 if (u1 == '\0')
15570 return 0;
15571 s1++;
15572 s2++;
15573 }
15574 return 0;
15575}
15576
15577Py_UNICODE*
15578Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15579{
15580 const Py_UNICODE *p;
15581 for (p = s; *p; p++)
15582 if (*p == c)
15583 return (Py_UNICODE*)p;
15584 return NULL;
15585}
15586
15587Py_UNICODE*
15588Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15589{
15590 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015591 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015592 while (p != s) {
15593 p--;
15594 if (*p == c)
15595 return (Py_UNICODE*)p;
15596 }
15597 return NULL;
15598}
Victor Stinner331ea922010-08-10 16:37:20 +000015599
Victor Stinner71133ff2010-09-01 23:43:53 +000015600Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015601PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015602{
Victor Stinner577db2c2011-10-11 22:12:48 +020015603 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015604 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015606 if (!PyUnicode_Check(unicode)) {
15607 PyErr_BadArgument();
15608 return NULL;
15609 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015610 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015611 if (u == NULL)
15612 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015613 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015614 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015615 PyErr_NoMemory();
15616 return NULL;
15617 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015618 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015619 size *= sizeof(Py_UNICODE);
15620 copy = PyMem_Malloc(size);
15621 if (copy == NULL) {
15622 PyErr_NoMemory();
15623 return NULL;
15624 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015625 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015626 return copy;
15627}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015628
Georg Brandl66c221e2010-10-14 07:04:07 +000015629/* A _string module, to export formatter_parser and formatter_field_name_split
15630 to the string.Formatter class implemented in Python. */
15631
15632static PyMethodDef _string_methods[] = {
15633 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15634 METH_O, PyDoc_STR("split the argument as a field name")},
15635 {"formatter_parser", (PyCFunction) formatter_parser,
15636 METH_O, PyDoc_STR("parse the argument as a format string")},
15637 {NULL, NULL}
15638};
15639
15640static struct PyModuleDef _string_module = {
15641 PyModuleDef_HEAD_INIT,
15642 "_string",
15643 PyDoc_STR("string helper module"),
15644 0,
15645 _string_methods,
15646 NULL,
15647 NULL,
15648 NULL,
15649 NULL
15650};
15651
15652PyMODINIT_FUNC
15653PyInit__string(void)
15654{
15655 return PyModule_Create(&_string_module);
15656}
15657
15658
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015659#ifdef __cplusplus
15660}
15661#endif