blob: 5b6b241cb62b68b491ca92fa6470c097730f6769 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010044#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040045#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010046#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050048#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070049#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000050
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
57/* #define INTERNED_STATS 1 */
58
59
Larry Hastings61272b72014-01-07 12:41:53 -080060/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090061class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080062[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090063/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
64
65/*[python input]
66class Py_UCS4_converter(CConverter):
67 type = 'Py_UCS4'
68 converter = 'convert_uc'
69
70 def converter_init(self):
71 if self.default is not unspecified:
72 self.c_default = ascii(self.default)
73 if len(self.c_default) > 4 or self.c_default[0] != "'":
74 self.c_default = hex(ord(self.default))
75
76[python start generated code]*/
77/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080078
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
Serhiy Storchaka05997252013-01-26 12:14:02 +020081NOTE: In the interpreter's initialization phase, some globals are currently
82 initialized dynamically as needed. In the process Unicode objects may
83 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Largest valid Unicode code point: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10ffff

/* _PyUnicode_CHECK(op): object check used inside assertions in this file.
   Debug builds run the structural consistency check with check_content=0;
   all other builds only test the object type. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200100
/* Unchecked field accessors.  Each one expands to the struct member
   itself, so the expansion can be read or assigned to.  No type or
   readiness check is performed. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: assert _PyUnicode_CHECK(op) before reading. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string.  For a compact ASCII string this is
   the memory located right after the PyASCIIObject header; otherwise it
   is the utf8 member. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((char*)((PyASCIIObject*)(op) + 1)) \
          : _PyUnicode_UTF8(op)))

/* UTF-8 length of a ready string.  For a compact ASCII string this is
   the string length; otherwise it is the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((PyASCIIObject*)(op))->length \
          : _PyUnicode_UTF8_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200135
/* File-local replacement for PyUnicode_READY(): asserts the object check,
   then evaluates to 0 when the string is already ready and to the result
   of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200142
/* True if the UTF-8 pointer of op is the character data itself rather
   than a separate buffer.  Must not be used on compact ASCII strings
   (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the character data itself rather
   than a separate buffer.  Every reference goes through the macro
   parameter `op`, so the result does not depend on the names of
   variables at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
150
/* True if the Unicode object owns a UTF-8 memory block of its own, i.e.
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a wstr memory block of its own, i.e.
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) \
     && (!PyUnicode_IS_READY(op) \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
164
/* Copy a run of characters while converting the element type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and are treated as "from_type *"; to is treated
   as "to_type *" and points to the output buffer.  Each argument is
   evaluated exactly once, in the order to, begin, end.

   The bulk of the input is copied four elements per iteration; the
   remaining 0-3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_to = (to_type *)(to);                         \
        const from_type *_iter = (from_type *)(begin);          \
        const from_type *_end = (from_type *)(end);             \
        Py_ssize_t _count = _end - _iter;                       \
        const from_type *_bulk_end =                            \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);             \
        for (; _iter < _bulk_end; _iter += 4, _to += 4) {       \
            _to[0] = (to_type) _iter[0];                        \
            _to[1] = (to_type) _iter[1];                        \
            _to[2] = (to_type) _iter[2];                        \
            _to[3] = (to_type) _iter[3];                        \
        }                                                       \
        for (; _iter < _end; ++_iter, ++_to) {                  \
            *_to = (to_type) *_iter;                            \
        }                                                       \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200188
/* Platform-specific overallocation factor.  A factor of N stands for
   overallocating by 1/N: 2 means 50% (best on Windows) and 4 means 25%
   (best on Linux). */
#if defined(MS_WINDOWS)
#  define OVERALLOCATE_FACTOR 2
#else
#  define OVERALLOCATE_FACTOR 4
#endif
196
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.  It starts
   out as NULL; _Py_INCREF_UNICODE_EMPTY() creates it on first use. */
static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) on first use.  If the creation fails,
   unicode_empty is left NULL and no reference is taken.  On success the
   freshly created object is referenced twice: once by the global and once
   for the caller. */
#define _Py_INCREF_UNICODE_EMPTY()                              \
    do {                                                        \
        if (unicode_empty == NULL) {                            \
            unicode_empty = PyUnicode_New(0, 0);                \
            if (unicode_empty != NULL) {                        \
                Py_INCREF(unicode_empty);                       \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                   \
        }                                                       \
        else {                                                  \
            Py_INCREF(unicode_empty);                           \
        }                                                       \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL if the singleton could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                              \
    do {                                                        \
        _Py_INCREF_UNICODE_EMPTY();                             \
        return unicode_empty;                                   \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000228
Victor Stinner59423e32018-11-26 13:40:01 +0100229static inline void
230unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
231 Py_ssize_t start, Py_ssize_t length)
232{
233 assert(0 <= start);
234 assert(kind != PyUnicode_WCHAR_KIND);
235 switch (kind) {
236 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100237 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100238 Py_UCS1 ch = (unsigned char)value;
239 Py_UCS1 *to = (Py_UCS1 *)data + start;
240 memset(to, ch, length);
241 break;
242 }
243 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100244 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100245 Py_UCS2 ch = (Py_UCS2)value;
246 Py_UCS2 *to = (Py_UCS2 *)data + start;
247 const Py_UCS2 *end = to + length;
248 for (; to < end; ++to) *to = ch;
249 break;
250 }
251 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100252 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100253 Py_UCS4 ch = value;
254 Py_UCS4 * to = (Py_UCS4 *)data + start;
255 const Py_UCS4 *end = to + length;
256 for (; to < end; ++to) *to = ch;
257 break;
258 }
259 default: Py_UNREACHABLE();
260 }
261}
262
263
/* Forward declaration */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); a NULL
   entry means that no string has been cached for that character yet. */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000274
/* Lookup table for fast detection of the most frequent whitespace
   characters.  Indexed by an ASCII code (0..127); an entry is 1 for:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: TAB, LF, VT, FF, CR */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F: FS, GS, RS, US */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27: SPACE */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
305
/* Forward declarations of helpers that are used before their definition
   in this file. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors from a buffer of `size` characters of a fixed width. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error handling helpers. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000331
/* Lookup table for fast detection of line break characters.  Indexed by
   an ASCII code (0..127); an entry is 1 for:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR
   and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: LF, VT, FF, CR */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F: FS, GS, RS */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
359
/* Converter named by the Py_UCS4_converter clinic class above; it is
   declared here, ahead of the generated clinic header included below. */
static int convert_uc(PyObject *obj, void *addr);
361
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300362#include "clinic/unicodeobject.c.h"
363
Victor Stinner3d4226a2018-08-29 22:21:32 +0200364_Py_error_handler
365_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200366{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200367 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200368 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200369 }
370 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200371 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200372 }
373 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200374 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 }
376 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200377 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 }
379 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200380 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
382 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200383 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200384 }
385 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200386 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200387 }
Victor Stinner50149202015-09-22 00:26:54 +0200388 return _Py_ERROR_OTHER;
389}
390
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300391/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
392 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000393Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000394PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000395{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000396#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000397 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000398#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000399 /* This is actually an illegal character, so it should
400 not be passed to unichr. */
401 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000402#endif
403}
404
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200405int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100406_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200407{
408 PyASCIIObject *ascii;
409 unsigned int kind;
410
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200411 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200412
413 ascii = (PyASCIIObject *)op;
414 kind = ascii->state.kind;
415
Victor Stinnera3b334d2011-10-03 13:53:37 +0200416 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200417 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
418 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200419 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200420 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200421 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200422 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200423
Victor Stinnera41463c2011-10-04 01:05:08 +0200424 if (ascii->state.compact == 1) {
425 data = compact + 1;
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200426 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
427 || kind == PyUnicode_2BYTE_KIND
428 || kind == PyUnicode_4BYTE_KIND);
429 _PyObject_ASSERT(op, ascii->state.ascii == 0);
430 _PyObject_ASSERT(op, ascii->state.ready == 1);
431 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100432 }
433 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200434 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
435
436 data = unicode->data.any;
437 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200438 _PyObject_ASSERT(op, ascii->length == 0);
439 _PyObject_ASSERT(op, ascii->hash == -1);
440 _PyObject_ASSERT(op, ascii->state.compact == 0);
441 _PyObject_ASSERT(op, ascii->state.ascii == 0);
442 _PyObject_ASSERT(op, ascii->state.ready == 0);
443 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
444 _PyObject_ASSERT(op, ascii->wstr != NULL);
445 _PyObject_ASSERT(op, data == NULL);
446 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200447 }
448 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200449 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
450 || kind == PyUnicode_2BYTE_KIND
451 || kind == PyUnicode_4BYTE_KIND);
452 _PyObject_ASSERT(op, ascii->state.compact == 0);
453 _PyObject_ASSERT(op, ascii->state.ready == 1);
454 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200455 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200456 _PyObject_ASSERT(op, compact->utf8 == data);
457 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200458 }
459 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200460 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200461 }
462 }
463 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200464 if (
465#if SIZEOF_WCHAR_T == 2
466 kind == PyUnicode_2BYTE_KIND
467#else
468 kind == PyUnicode_4BYTE_KIND
469#endif
470 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200471 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200472 _PyObject_ASSERT(op, ascii->wstr == data);
473 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200474 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200475 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200476 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200477
478 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200479 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200480 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200481 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200482 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200483
484 /* check that the best kind is used: O(n) operation */
485 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200486 Py_ssize_t i;
487 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200488 void *data;
489 Py_UCS4 ch;
490
491 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200492 for (i=0; i < ascii->length; i++)
493 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200494 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200495 if (ch > maxchar)
496 maxchar = ch;
497 }
498 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100499 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200500 _PyObject_ASSERT(op, maxchar >= 128);
501 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100502 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200503 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200504 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200505 }
Victor Stinner77faf692011-11-20 18:56:05 +0100506 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200507 _PyObject_ASSERT(op, maxchar >= 0x100);
508 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100509 }
510 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200511 _PyObject_ASSERT(op, maxchar >= 0x10000);
512 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100513 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200514 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200515 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400516 return 1;
517}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200518
Victor Stinner910337b2011-10-03 03:20:16 +0200519
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100520static PyObject*
521unicode_result_wchar(PyObject *unicode)
522{
523#ifndef Py_DEBUG
524 Py_ssize_t len;
525
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 len = _PyUnicode_WSTR_LENGTH(unicode);
527 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100528 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200529 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100530 }
531
532 if (len == 1) {
533 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100534 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100535 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
536 Py_DECREF(unicode);
537 return latin1_char;
538 }
539 }
540
541 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200542 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100543 return NULL;
544 }
545#else
Victor Stinneraa771272012-10-04 02:32:58 +0200546 assert(Py_REFCNT(unicode) == 1);
547
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100548 /* don't make the result ready in debug mode to ensure that the caller
549 makes the string ready before using it */
550 assert(_PyUnicode_CheckConsistency(unicode, 1));
551#endif
552 return unicode;
553}
554
555static PyObject*
556unicode_result_ready(PyObject *unicode)
557{
558 Py_ssize_t length;
559
560 length = PyUnicode_GET_LENGTH(unicode);
561 if (length == 0) {
562 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100563 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200564 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100565 }
566 return unicode_empty;
567 }
568
569 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200570 void *data = PyUnicode_DATA(unicode);
571 int kind = PyUnicode_KIND(unicode);
572 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100573 if (ch < 256) {
574 PyObject *latin1_char = unicode_latin1[ch];
575 if (latin1_char != NULL) {
576 if (unicode != latin1_char) {
577 Py_INCREF(latin1_char);
578 Py_DECREF(unicode);
579 }
580 return latin1_char;
581 }
582 else {
583 assert(_PyUnicode_CheckConsistency(unicode, 1));
584 Py_INCREF(unicode);
585 unicode_latin1[ch] = unicode;
586 return unicode;
587 }
588 }
589 }
590
591 assert(_PyUnicode_CheckConsistency(unicode, 1));
592 return unicode;
593}
594
595static PyObject*
596unicode_result(PyObject *unicode)
597{
598 assert(_PyUnicode_CHECK(unicode));
599 if (PyUnicode_IS_READY(unicode))
600 return unicode_result_ready(unicode);
601 else
602 return unicode_result_wchar(unicode);
603}
604
Victor Stinnerc4b49542011-12-11 22:44:26 +0100605static PyObject*
606unicode_result_unchanged(PyObject *unicode)
607{
608 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500609 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100610 return NULL;
611 Py_INCREF(unicode);
612 return unicode;
613 }
614 else
615 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100616 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100617}
618
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200619/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
620 ASCII, Latin1, UTF-8, etc. */
621static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200622backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200623 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
624{
Victor Stinnerad771582015-10-09 12:38:53 +0200625 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200626 Py_UCS4 ch;
627 enum PyUnicode_Kind kind;
628 void *data;
629
630 assert(PyUnicode_IS_READY(unicode));
631 kind = PyUnicode_KIND(unicode);
632 data = PyUnicode_DATA(unicode);
633
634 size = 0;
635 /* determine replacement size */
636 for (i = collstart; i < collend; ++i) {
637 Py_ssize_t incr;
638
639 ch = PyUnicode_READ(kind, data, i);
640 if (ch < 0x100)
641 incr = 2+2;
642 else if (ch < 0x10000)
643 incr = 2+4;
644 else {
645 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200646 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200647 }
648 if (size > PY_SSIZE_T_MAX - incr) {
649 PyErr_SetString(PyExc_OverflowError,
650 "encoded result is too long for a Python string");
651 return NULL;
652 }
653 size += incr;
654 }
655
Victor Stinnerad771582015-10-09 12:38:53 +0200656 str = _PyBytesWriter_Prepare(writer, str, size);
657 if (str == NULL)
658 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200659
660 /* generate replacement */
661 for (i = collstart; i < collend; ++i) {
662 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200663 *str++ = '\\';
664 if (ch >= 0x00010000) {
665 *str++ = 'U';
666 *str++ = Py_hexdigits[(ch>>28)&0xf];
667 *str++ = Py_hexdigits[(ch>>24)&0xf];
668 *str++ = Py_hexdigits[(ch>>20)&0xf];
669 *str++ = Py_hexdigits[(ch>>16)&0xf];
670 *str++ = Py_hexdigits[(ch>>12)&0xf];
671 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200672 }
Victor Stinner797485e2015-10-09 03:17:30 +0200673 else if (ch >= 0x100) {
674 *str++ = 'u';
675 *str++ = Py_hexdigits[(ch>>12)&0xf];
676 *str++ = Py_hexdigits[(ch>>8)&0xf];
677 }
678 else
679 *str++ = 'x';
680 *str++ = Py_hexdigits[(ch>>4)&0xf];
681 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200682 }
683 return str;
684}
685
686/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
687 ASCII, Latin1, UTF-8, etc. */
688static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200689xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200690 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
691{
Victor Stinnerad771582015-10-09 12:38:53 +0200692 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200693 Py_UCS4 ch;
694 enum PyUnicode_Kind kind;
695 void *data;
696
697 assert(PyUnicode_IS_READY(unicode));
698 kind = PyUnicode_KIND(unicode);
699 data = PyUnicode_DATA(unicode);
700
701 size = 0;
702 /* determine replacement size */
703 for (i = collstart; i < collend; ++i) {
704 Py_ssize_t incr;
705
706 ch = PyUnicode_READ(kind, data, i);
707 if (ch < 10)
708 incr = 2+1+1;
709 else if (ch < 100)
710 incr = 2+2+1;
711 else if (ch < 1000)
712 incr = 2+3+1;
713 else if (ch < 10000)
714 incr = 2+4+1;
715 else if (ch < 100000)
716 incr = 2+5+1;
717 else if (ch < 1000000)
718 incr = 2+6+1;
719 else {
720 assert(ch <= MAX_UNICODE);
721 incr = 2+7+1;
722 }
723 if (size > PY_SSIZE_T_MAX - incr) {
724 PyErr_SetString(PyExc_OverflowError,
725 "encoded result is too long for a Python string");
726 return NULL;
727 }
728 size += incr;
729 }
730
Victor Stinnerad771582015-10-09 12:38:53 +0200731 str = _PyBytesWriter_Prepare(writer, str, size);
732 if (str == NULL)
733 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734
735 /* generate replacement */
736 for (i = collstart; i < collend; ++i) {
737 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
738 }
739 return str;
740}
741
Thomas Wouters477c8d52006-05-27 19:21:47 +0000742/* --- Bloom Filters ----------------------------------------------------- */
743
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask of BLOOM_WIDTH bits, using
   the low bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the
   bit index. */
747
748/* the linebreak mask is set up by Unicode_Init below */
749
Antoine Pitrouf068f942010-01-13 14:19:12 +0000750#if LONG_BIT >= 128
751#define BLOOM_WIDTH 128
752#elif LONG_BIT >= 64
753#define BLOOM_WIDTH 64
754#elif LONG_BIT >= 32
755#define BLOOM_WIDTH 32
756#else
757#error "LONG_BIT is smaller than 32"
758#endif
759
Thomas Wouters477c8d52006-05-27 19:21:47 +0000760#define BLOOM_MASK unsigned long
761
Serhiy Storchaka05997252013-01-26 12:14:02 +0200762static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000763
Antoine Pitrouf068f942010-01-13 14:19:12 +0000764#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000765
Benjamin Peterson29060642009-01-31 22:14:21 +0000766#define BLOOM_LINEBREAK(ch) \
767 ((ch) < 128U ? ascii_linebreak[(ch)] : \
768 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000769
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700770static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200771make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772{
Victor Stinnera85af502013-04-09 21:53:54 +0200773#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
774 do { \
775 TYPE *data = (TYPE *)PTR; \
776 TYPE *end = data + LEN; \
777 Py_UCS4 ch; \
778 for (; data != end; data++) { \
779 ch = *data; \
780 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
781 } \
782 break; \
783 } while (0)
784
Thomas Wouters477c8d52006-05-27 19:21:47 +0000785 /* calculate simple bloom-style bitmask for a given unicode string */
786
Antoine Pitrouf068f942010-01-13 14:19:12 +0000787 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000788
789 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200790 switch (kind) {
791 case PyUnicode_1BYTE_KIND:
792 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
793 break;
794 case PyUnicode_2BYTE_KIND:
795 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
796 break;
797 case PyUnicode_4BYTE_KIND:
798 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
799 break;
800 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700801 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200802 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000803 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200804
805#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000806}
807
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300808static int
809ensure_unicode(PyObject *obj)
810{
811 if (!PyUnicode_Check(obj)) {
812 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200813 "must be str, not %.100s",
814 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300815 return -1;
816 }
817 return PyUnicode_READY(obj);
818}
819
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200820/* Compilation of templated routines */
821
822#include "stringlib/asciilib.h"
823#include "stringlib/fastsearch.h"
824#include "stringlib/partition.h"
825#include "stringlib/split.h"
826#include "stringlib/count.h"
827#include "stringlib/find.h"
828#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200829#include "stringlib/undef.h"
830
831#include "stringlib/ucs1lib.h"
832#include "stringlib/fastsearch.h"
833#include "stringlib/partition.h"
834#include "stringlib/split.h"
835#include "stringlib/count.h"
836#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300837#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200838#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200839#include "stringlib/undef.h"
840
841#include "stringlib/ucs2lib.h"
842#include "stringlib/fastsearch.h"
843#include "stringlib/partition.h"
844#include "stringlib/split.h"
845#include "stringlib/count.h"
846#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300847#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200848#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200849#include "stringlib/undef.h"
850
851#include "stringlib/ucs4lib.h"
852#include "stringlib/fastsearch.h"
853#include "stringlib/partition.h"
854#include "stringlib/split.h"
855#include "stringlib/count.h"
856#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300857#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200858#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200859#include "stringlib/undef.h"
860
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200861#include "stringlib/unicodedefs.h"
862#include "stringlib/fastsearch.h"
863#include "stringlib/count.h"
864#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100865#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200866
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867/* --- Unicode Object ----------------------------------------------------- */
868
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700869static inline Py_ssize_t
870findchar(const void *s, int kind,
871 Py_ssize_t size, Py_UCS4 ch,
872 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200873{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200874 switch (kind) {
875 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200876 if ((Py_UCS1) ch != ch)
877 return -1;
878 if (direction > 0)
879 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
880 else
881 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200882 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200883 if ((Py_UCS2) ch != ch)
884 return -1;
885 if (direction > 0)
886 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
887 else
888 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200889 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200890 if (direction > 0)
891 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
892 else
893 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200894 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700895 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897}
898
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   The characters from index `old_length` up to the current length of
   `unicode` are overwritten with 0xff bytes; nothing is written when the
   current length is not greater than `old_length`.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918static PyObject*
919resize_compact(PyObject *unicode, Py_ssize_t length)
920{
921 Py_ssize_t char_size;
922 Py_ssize_t struct_size;
923 Py_ssize_t new_size;
924 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100925 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200926#ifdef Py_DEBUG
927 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
928#endif
929
Victor Stinner79891572012-05-03 13:43:07 +0200930 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200931 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100932 assert(PyUnicode_IS_COMPACT(unicode));
933
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200934 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100935 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200936 struct_size = sizeof(PyASCIIObject);
937 else
938 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200939 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
942 PyErr_NoMemory();
943 return NULL;
944 }
945 new_size = (struct_size + (length + 1) * char_size);
946
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200947 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_UTF8(unicode));
949 _PyUnicode_UTF8(unicode) = NULL;
950 _PyUnicode_UTF8_LENGTH(unicode) = 0;
951 }
Victor Stinner84def372011-12-11 20:04:56 +0100952 _Py_DEC_REFTOTAL;
953 _Py_ForgetReference(unicode);
954
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300955 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100956 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100957 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200958 PyErr_NoMemory();
959 return NULL;
960 }
Victor Stinner84def372011-12-11 20:04:56 +0100961 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200962 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100963
Victor Stinnerfe226c02011-10-03 03:52:20 +0200964 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200965 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100967 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200968 _PyUnicode_WSTR_LENGTH(unicode) = length;
969 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100970 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
971 PyObject_DEL(_PyUnicode_WSTR(unicode));
972 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100973 if (!PyUnicode_IS_ASCII(unicode))
974 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100975 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200976#ifdef Py_DEBUG
977 unicode_fill_invalid(unicode, old_length);
978#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
980 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200981 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982 return unicode;
983}
984
Alexander Belopolsky40018472011-02-26 01:02:56 +0000985static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200986resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987{
Victor Stinner95663112011-10-04 01:03:50 +0200988 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200991 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000992
Victor Stinnerfe226c02011-10-03 03:52:20 +0200993 if (PyUnicode_IS_READY(unicode)) {
994 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200995 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200997#ifdef Py_DEBUG
998 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
999#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001000
1001 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001002 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001003 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1004 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001005
1006 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1007 PyErr_NoMemory();
1008 return -1;
1009 }
1010 new_size = (length + 1) * char_size;
1011
Victor Stinner7a9105a2011-12-12 00:13:42 +01001012 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1013 {
1014 PyObject_DEL(_PyUnicode_UTF8(unicode));
1015 _PyUnicode_UTF8(unicode) = NULL;
1016 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1017 }
1018
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 data = (PyObject *)PyObject_REALLOC(data, new_size);
1020 if (data == NULL) {
1021 PyErr_NoMemory();
1022 return -1;
1023 }
1024 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001025 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001026 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001027 _PyUnicode_WSTR_LENGTH(unicode) = length;
1028 }
1029 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001030 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001031 _PyUnicode_UTF8_LENGTH(unicode) = length;
1032 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033 _PyUnicode_LENGTH(unicode) = length;
1034 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001035#ifdef Py_DEBUG
1036 unicode_fill_invalid(unicode, old_length);
1037#endif
Victor Stinner95663112011-10-04 01:03:50 +02001038 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001039 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001042 }
Victor Stinner95663112011-10-04 01:03:50 +02001043 assert(_PyUnicode_WSTR(unicode) != NULL);
1044
1045 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001046 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001047 PyErr_NoMemory();
1048 return -1;
1049 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001050 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001051 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001052 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001053 if (!wstr) {
1054 PyErr_NoMemory();
1055 return -1;
1056 }
1057 _PyUnicode_WSTR(unicode) = wstr;
1058 _PyUnicode_WSTR(unicode)[length] = 0;
1059 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001060 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 return 0;
1062}
1063
Victor Stinnerfe226c02011-10-03 03:52:20 +02001064static PyObject*
1065resize_copy(PyObject *unicode, Py_ssize_t length)
1066{
1067 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001068 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001070
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001071 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072
1073 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1074 if (copy == NULL)
1075 return NULL;
1076
1077 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001078 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001080 }
1081 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001082 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001083
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001084 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001085 if (w == NULL)
1086 return NULL;
1087 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1088 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001089 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001090 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001091 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092 }
1093}
1094
/* We allocate one more character (not just one more byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098
1099 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001100 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101
1102*/
1103
Alexander Belopolsky40018472011-02-26 01:02:56 +00001104static PyUnicodeObject *
1105_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001107 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109
Thomas Wouters477c8d52006-05-27 19:21:47 +00001110 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 if (length == 0 && unicode_empty != NULL) {
1112 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001113 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
1115
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001116 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001117 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001118 return (PyUnicodeObject *)PyErr_NoMemory();
1119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 if (length < 0) {
1121 PyErr_SetString(PyExc_SystemError,
1122 "Negative size passed to _PyUnicode_New");
1123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124 }
1125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1127 if (unicode == NULL)
1128 return NULL;
1129 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001130
1131 _PyUnicode_WSTR_LENGTH(unicode) = length;
1132 _PyUnicode_HASH(unicode) = -1;
1133 _PyUnicode_STATE(unicode).interned = 0;
1134 _PyUnicode_STATE(unicode).kind = 0;
1135 _PyUnicode_STATE(unicode).compact = 0;
1136 _PyUnicode_STATE(unicode).ready = 0;
1137 _PyUnicode_STATE(unicode).ascii = 0;
1138 _PyUnicode_DATA_ANY(unicode) = NULL;
1139 _PyUnicode_LENGTH(unicode) = 0;
1140 _PyUnicode_UTF8(unicode) = NULL;
1141 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1144 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001145 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001147 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149
Jeremy Hyltond8082792003-09-16 19:41:39 +00001150 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001151 * the caller fails before initializing str -- unicode_resize()
1152 * reads str[0], and the Keep-Alive optimization can keep memory
1153 * allocated for str alive across a call to unicode_dealloc(unicode).
1154 * We don't want unicode_resize to read uninitialized memory in
1155 * that case.
1156 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157 _PyUnicode_WSTR(unicode)[0] = 0;
1158 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001159
Victor Stinner7931d9a2011-11-04 00:22:48 +01001160 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 return unicode;
1162}
1163
Victor Stinnerf42dc442011-10-02 23:33:16 +02001164static const char*
1165unicode_kind_name(PyObject *unicode)
1166{
Victor Stinner42dfd712011-10-03 14:41:45 +02001167 /* don't check consistency: unicode_kind_name() is called from
1168 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 if (!PyUnicode_IS_COMPACT(unicode))
1170 {
1171 if (!PyUnicode_IS_READY(unicode))
1172 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001173 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 {
1175 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001176 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001177 return "legacy ascii";
1178 else
1179 return "legacy latin1";
1180 case PyUnicode_2BYTE_KIND:
1181 return "legacy UCS2";
1182 case PyUnicode_4BYTE_KIND:
1183 return "legacy UCS4";
1184 default:
1185 return "<legacy invalid kind>";
1186 }
1187 }
1188 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001189 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001190 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001191 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001192 return "ascii";
1193 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001194 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001195 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001196 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001197 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001198 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001199 default:
1200 return "<invalid compact kind>";
1201 }
1202}
1203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001206char *_PyUnicode_utf8(void *unicode_raw){
1207 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001208 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209}
1210
Victor Stinnera42de742018-11-22 10:25:22 +01001211void *_PyUnicode_compact_data(void *unicode_raw) {
1212 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 return _PyUnicode_COMPACT_DATA(unicode);
1214}
Victor Stinnera42de742018-11-22 10:25:22 +01001215void *_PyUnicode_data(void *unicode_raw) {
1216 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 printf("obj %p\n", unicode);
1218 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1219 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1220 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1221 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1222 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1223 return PyUnicode_DATA(unicode);
1224}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001225
1226void
1227_PyUnicode_Dump(PyObject *op)
1228{
1229 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001230 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1231 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1232 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001233
Victor Stinnera849a4b2011-10-03 12:12:11 +02001234 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001235 {
1236 if (ascii->state.ascii)
1237 data = (ascii + 1);
1238 else
1239 data = (compact + 1);
1240 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001241 else
1242 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001243 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1244 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001245
Victor Stinnera849a4b2011-10-03 12:12:11 +02001246 if (ascii->wstr == data)
1247 printf("shared ");
1248 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001249
Victor Stinnera3b334d2011-10-03 13:53:37 +02001250 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001251 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001252 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1253 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001254 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1255 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001256 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001257 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001258}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259#endif
1260
1261PyObject *
1262PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1263{
1264 PyObject *obj;
1265 PyCompactUnicodeObject *unicode;
1266 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001267 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001268 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 Py_ssize_t char_size;
1270 Py_ssize_t struct_size;
1271
1272 /* Optimization for empty strings */
1273 if (size == 0 && unicode_empty != NULL) {
1274 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001275 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276 }
1277
Victor Stinner9e9d6892011-10-04 01:02:02 +02001278 is_ascii = 0;
1279 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001280 struct_size = sizeof(PyCompactUnicodeObject);
1281 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001282 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 char_size = 1;
1284 is_ascii = 1;
1285 struct_size = sizeof(PyASCIIObject);
1286 }
1287 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001288 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 char_size = 1;
1290 }
1291 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001292 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293 char_size = 2;
1294 if (sizeof(wchar_t) == 2)
1295 is_sharing = 1;
1296 }
1297 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001298 if (maxchar > MAX_UNICODE) {
1299 PyErr_SetString(PyExc_SystemError,
1300 "invalid maximum character passed to PyUnicode_New");
1301 return NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 char_size = 4;
1305 if (sizeof(wchar_t) == 4)
1306 is_sharing = 1;
1307 }
1308
1309 /* Ensure we won't overflow the size. */
1310 if (size < 0) {
1311 PyErr_SetString(PyExc_SystemError,
1312 "Negative size passed to PyUnicode_New");
1313 return NULL;
1314 }
1315 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1316 return PyErr_NoMemory();
1317
1318 /* Duplicated allocation code from _PyObject_New() instead of a call to
1319 * PyObject_New() so we are able to allocate space for the object and
1320 * it's data buffer.
1321 */
1322 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1323 if (obj == NULL)
1324 return PyErr_NoMemory();
1325 obj = PyObject_INIT(obj, &PyUnicode_Type);
1326 if (obj == NULL)
1327 return NULL;
1328
1329 unicode = (PyCompactUnicodeObject *)obj;
1330 if (is_ascii)
1331 data = ((PyASCIIObject*)obj) + 1;
1332 else
1333 data = unicode + 1;
1334 _PyUnicode_LENGTH(unicode) = size;
1335 _PyUnicode_HASH(unicode) = -1;
1336 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001337 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 _PyUnicode_STATE(unicode).compact = 1;
1339 _PyUnicode_STATE(unicode).ready = 1;
1340 _PyUnicode_STATE(unicode).ascii = is_ascii;
1341 if (is_ascii) {
1342 ((char*)data)[size] = 0;
1343 _PyUnicode_WSTR(unicode) = NULL;
1344 }
Victor Stinner8f825062012-04-27 13:55:39 +02001345 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 ((char*)data)[size] = 0;
1347 _PyUnicode_WSTR(unicode) = NULL;
1348 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001350 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352 else {
1353 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001354 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001355 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001357 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 ((Py_UCS4*)data)[size] = 0;
1359 if (is_sharing) {
1360 _PyUnicode_WSTR_LENGTH(unicode) = size;
1361 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1362 }
1363 else {
1364 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1365 _PyUnicode_WSTR(unicode) = NULL;
1366 }
1367 }
Victor Stinner8f825062012-04-27 13:55:39 +02001368#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001369 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001370#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001371 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 return obj;
1373}
1374
1375#if SIZEOF_WCHAR_T == 2
1376/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1377 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001378 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001379
1380 This function assumes that unicode can hold one more code point than wstr
1381 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001382static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001384 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385{
1386 const wchar_t *iter;
1387 Py_UCS4 *ucs4_out;
1388
Victor Stinner910337b2011-10-03 03:20:16 +02001389 assert(unicode != NULL);
1390 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1392 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1393
1394 for (iter = begin; iter < end; ) {
1395 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1396 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001397 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1398 && (iter+1) < end
1399 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 {
Victor Stinner551ac952011-11-29 22:58:13 +01001401 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 iter += 2;
1403 }
1404 else {
1405 *ucs4_out++ = *iter;
1406 iter++;
1407 }
1408 }
1409 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1410 _PyUnicode_GET_LENGTH(unicode)));
1411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412}
1413#endif
1414
Victor Stinnercd9950f2011-10-02 00:34:53 +02001415static int
Victor Stinner488fa492011-12-12 00:01:39 +01001416unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001417{
Victor Stinner488fa492011-12-12 00:01:39 +01001418 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001419 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001420 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001421 return -1;
1422 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001423 return 0;
1424}
1425
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001426static int
1427_copy_characters(PyObject *to, Py_ssize_t to_start,
1428 PyObject *from, Py_ssize_t from_start,
1429 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001431 unsigned int from_kind, to_kind;
1432 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433
Victor Stinneree4544c2012-05-09 22:24:08 +02001434 assert(0 <= how_many);
1435 assert(0 <= from_start);
1436 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001438 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001439 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440
Victor Stinnerd3f08822012-05-29 12:57:52 +02001441 assert(PyUnicode_Check(to));
1442 assert(PyUnicode_IS_READY(to));
1443 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1444
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 if (how_many == 0)
1446 return 0;
1447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001449 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001451 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452
Victor Stinnerf1852262012-06-16 16:38:26 +02001453#ifdef Py_DEBUG
1454 if (!check_maxchar
1455 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1456 {
1457 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1458 Py_UCS4 ch;
1459 Py_ssize_t i;
1460 for (i=0; i < how_many; i++) {
1461 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1462 assert(ch <= to_maxchar);
1463 }
1464 }
1465#endif
1466
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001467 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001468 if (check_maxchar
1469 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1470 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001471 /* Writing Latin-1 characters into an ASCII string requires to
1472 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001473 Py_UCS4 max_char;
1474 max_char = ucs1lib_find_max_char(from_data,
1475 (Py_UCS1*)from_data + how_many);
1476 if (max_char >= 128)
1477 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001478 }
Christian Heimesf051e432016-09-13 20:22:02 +02001479 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001480 (char*)from_data + from_kind * from_start,
1481 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001483 else if (from_kind == PyUnicode_1BYTE_KIND
1484 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001485 {
1486 _PyUnicode_CONVERT_BYTES(
1487 Py_UCS1, Py_UCS2,
1488 PyUnicode_1BYTE_DATA(from) + from_start,
1489 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1490 PyUnicode_2BYTE_DATA(to) + to_start
1491 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001492 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001493 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001494 && to_kind == PyUnicode_4BYTE_KIND)
1495 {
1496 _PyUnicode_CONVERT_BYTES(
1497 Py_UCS1, Py_UCS4,
1498 PyUnicode_1BYTE_DATA(from) + from_start,
1499 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1500 PyUnicode_4BYTE_DATA(to) + to_start
1501 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001502 }
1503 else if (from_kind == PyUnicode_2BYTE_KIND
1504 && to_kind == PyUnicode_4BYTE_KIND)
1505 {
1506 _PyUnicode_CONVERT_BYTES(
1507 Py_UCS2, Py_UCS4,
1508 PyUnicode_2BYTE_DATA(from) + from_start,
1509 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1510 PyUnicode_4BYTE_DATA(to) + to_start
1511 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001512 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001514 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1515
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001516 if (!check_maxchar) {
1517 if (from_kind == PyUnicode_2BYTE_KIND
1518 && to_kind == PyUnicode_1BYTE_KIND)
1519 {
1520 _PyUnicode_CONVERT_BYTES(
1521 Py_UCS2, Py_UCS1,
1522 PyUnicode_2BYTE_DATA(from) + from_start,
1523 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1524 PyUnicode_1BYTE_DATA(to) + to_start
1525 );
1526 }
1527 else if (from_kind == PyUnicode_4BYTE_KIND
1528 && to_kind == PyUnicode_1BYTE_KIND)
1529 {
1530 _PyUnicode_CONVERT_BYTES(
1531 Py_UCS4, Py_UCS1,
1532 PyUnicode_4BYTE_DATA(from) + from_start,
1533 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1534 PyUnicode_1BYTE_DATA(to) + to_start
1535 );
1536 }
1537 else if (from_kind == PyUnicode_4BYTE_KIND
1538 && to_kind == PyUnicode_2BYTE_KIND)
1539 {
1540 _PyUnicode_CONVERT_BYTES(
1541 Py_UCS4, Py_UCS2,
1542 PyUnicode_4BYTE_DATA(from) + from_start,
1543 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1544 PyUnicode_2BYTE_DATA(to) + to_start
1545 );
1546 }
1547 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001548 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001549 }
1550 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001551 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001552 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001553 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001554 Py_ssize_t i;
1555
Victor Stinnera0702ab2011-09-29 14:14:38 +02001556 for (i=0; i < how_many; i++) {
1557 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001558 if (ch > to_maxchar)
1559 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001560 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1561 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001562 }
1563 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 return 0;
1565}
1566
Victor Stinnerd3f08822012-05-29 12:57:52 +02001567void
1568_PyUnicode_FastCopyCharacters(
1569 PyObject *to, Py_ssize_t to_start,
1570 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571{
1572 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1573}
1574
1575Py_ssize_t
1576PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1577 PyObject *from, Py_ssize_t from_start,
1578 Py_ssize_t how_many)
1579{
1580 int err;
1581
1582 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1583 PyErr_BadInternalCall();
1584 return -1;
1585 }
1586
Benjamin Petersonbac79492012-01-14 13:34:47 -05001587 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001588 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001589 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001590 return -1;
1591
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001592 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001593 PyErr_SetString(PyExc_IndexError, "string index out of range");
1594 return -1;
1595 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001596 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001597 PyErr_SetString(PyExc_IndexError, "string index out of range");
1598 return -1;
1599 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001600 if (how_many < 0) {
1601 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1602 return -1;
1603 }
1604 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001605 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1606 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001607 "Cannot write %zi characters at %zi "
1608 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001609 how_many, to_start, PyUnicode_GET_LENGTH(to));
1610 return -1;
1611 }
1612
1613 if (how_many == 0)
1614 return 0;
1615
Victor Stinner488fa492011-12-12 00:01:39 +01001616 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001617 return -1;
1618
1619 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1620 if (err) {
1621 PyErr_Format(PyExc_SystemError,
1622 "Cannot copy %s characters "
1623 "into a string of %s characters",
1624 unicode_kind_name(from),
1625 unicode_kind_name(to));
1626 return -1;
1627 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001628 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629}
1630
Victor Stinner17222162011-09-28 22:15:37 +02001631/* Find the maximum code point and count the number of surrogate pairs so a
1632 correct string length can be computed before converting a string to UCS4.
1633 This function counts single surrogates as a character and not as a pair.
1634
1635 Return 0 on success, or -1 on error. */
1636static int
1637find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1638 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639{
1640 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001641 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642
Victor Stinnerc53be962011-10-02 21:33:54 +02001643 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 *num_surrogates = 0;
1645 *maxchar = 0;
1646
1647 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001649 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1650 && (iter+1) < end
1651 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1652 {
1653 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1654 ++(*num_surrogates);
1655 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 }
1657 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001659 {
1660 ch = *iter;
1661 iter++;
1662 }
1663 if (ch > *maxchar) {
1664 *maxchar = ch;
1665 if (*maxchar > MAX_UNICODE) {
1666 PyErr_Format(PyExc_ValueError,
1667 "character U+%x is not in range [U+0000; U+10ffff]",
1668 ch);
1669 return -1;
1670 }
1671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 }
1673 return 0;
1674}
1675
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001676int
1677_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678{
1679 wchar_t *end;
1680 Py_UCS4 maxchar = 0;
1681 Py_ssize_t num_surrogates;
1682#if SIZEOF_WCHAR_T == 2
1683 Py_ssize_t length_wo_surrogates;
1684#endif
1685
Georg Brandl7597add2011-10-05 16:36:47 +02001686 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001687 strings were created using _PyObject_New() and where no canonical
1688 representation (the str field) has been set yet aka strings
1689 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001690 assert(_PyUnicode_CHECK(unicode));
1691 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001694 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001695 /* Actually, it should neither be interned nor be anything else: */
1696 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001699 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001700 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702
1703 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1705 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 PyErr_NoMemory();
1707 return -1;
1708 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001709 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 _PyUnicode_WSTR(unicode), end,
1711 PyUnicode_1BYTE_DATA(unicode));
1712 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1713 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1714 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1715 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001716 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001717 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001718 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 }
1720 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001721 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001722 _PyUnicode_UTF8(unicode) = NULL;
1723 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 }
1725 PyObject_FREE(_PyUnicode_WSTR(unicode));
1726 _PyUnicode_WSTR(unicode) = NULL;
1727 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1728 }
1729 /* In this case we might have to convert down from 4-byte native
1730 wchar_t to 2-byte unicode. */
1731 else if (maxchar < 65536) {
1732 assert(num_surrogates == 0 &&
1733 "FindMaxCharAndNumSurrogatePairs() messed up");
1734
Victor Stinner506f5922011-09-28 22:34:18 +02001735#if SIZEOF_WCHAR_T == 2
1736 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001737 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001738 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1739 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001743#else
1744 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001745 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001746 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001748 PyErr_NoMemory();
1749 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 }
Victor Stinner506f5922011-09-28 22:34:18 +02001751 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1752 _PyUnicode_WSTR(unicode), end,
1753 PyUnicode_2BYTE_DATA(unicode));
1754 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1755 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1756 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001757 _PyUnicode_UTF8(unicode) = NULL;
1758 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001759 PyObject_FREE(_PyUnicode_WSTR(unicode));
1760 _PyUnicode_WSTR(unicode) = NULL;
1761 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1762#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 }
1764 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1765 else {
1766#if SIZEOF_WCHAR_T == 2
1767 /* in case the native representation is 2-bytes, we need to allocate a
1768 new normalized 4-byte version. */
1769 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001770 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1771 PyErr_NoMemory();
1772 return -1;
1773 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1775 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 PyErr_NoMemory();
1777 return -1;
1778 }
1779 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1780 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001781 _PyUnicode_UTF8(unicode) = NULL;
1782 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001783 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1784 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001785 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 PyObject_FREE(_PyUnicode_WSTR(unicode));
1787 _PyUnicode_WSTR(unicode) = NULL;
1788 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1789#else
1790 assert(num_surrogates == 0);
1791
Victor Stinnerc3c74152011-10-02 20:39:55 +02001792 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 _PyUnicode_UTF8(unicode) = NULL;
1795 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1797#endif
1798 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1799 }
1800 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001801 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 return 0;
1803}
1804
Alexander Belopolsky40018472011-02-26 01:02:56 +00001805static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001806unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807{
Walter Dörwald16807132007-05-25 13:52:07 +00001808 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001809 case SSTATE_NOT_INTERNED:
1810 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001811
Benjamin Peterson29060642009-01-31 22:14:21 +00001812 case SSTATE_INTERNED_MORTAL:
1813 /* revive dead object temporarily for DelItem */
1814 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001815 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001816 Py_FatalError(
1817 "deletion of interned string failed");
1818 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001819
Benjamin Peterson29060642009-01-31 22:14:21 +00001820 case SSTATE_INTERNED_IMMORTAL:
1821 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001822 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001823
Benjamin Peterson29060642009-01-31 22:14:21 +00001824 default:
1825 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001826 }
1827
Victor Stinner03490912011-10-03 23:45:12 +02001828 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001830 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001831 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001832 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1833 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001835 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836}
1837
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001838#ifdef Py_DEBUG
1839static int
1840unicode_is_singleton(PyObject *unicode)
1841{
1842 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1843 if (unicode == unicode_empty)
1844 return 1;
1845 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1846 {
1847 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1848 if (ch < 256 && unicode_latin1[ch] == unicode)
1849 return 1;
1850 }
1851 return 0;
1852}
1853#endif
1854
Alexander Belopolsky40018472011-02-26 01:02:56 +00001855static int
Victor Stinner488fa492011-12-12 00:01:39 +01001856unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001857{
Victor Stinner488fa492011-12-12 00:01:39 +01001858 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001859 if (Py_REFCNT(unicode) != 1)
1860 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (_PyUnicode_HASH(unicode) != -1)
1862 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001863 if (PyUnicode_CHECK_INTERNED(unicode))
1864 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001865 if (!PyUnicode_CheckExact(unicode))
1866 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001867#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001868 /* singleton refcount is greater than 1 */
1869 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001870#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001871 return 1;
1872}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001873
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874static int
1875unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1876{
1877 PyObject *unicode;
1878 Py_ssize_t old_length;
1879
1880 assert(p_unicode != NULL);
1881 unicode = *p_unicode;
1882
1883 assert(unicode != NULL);
1884 assert(PyUnicode_Check(unicode));
1885 assert(0 <= length);
1886
Victor Stinner910337b2011-10-03 03:20:16 +02001887 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001888 old_length = PyUnicode_WSTR_LENGTH(unicode);
1889 else
1890 old_length = PyUnicode_GET_LENGTH(unicode);
1891 if (old_length == length)
1892 return 0;
1893
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001894 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001895 _Py_INCREF_UNICODE_EMPTY();
1896 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001898 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001899 return 0;
1900 }
1901
Victor Stinner488fa492011-12-12 00:01:39 +01001902 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001903 PyObject *copy = resize_copy(unicode, length);
1904 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001905 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001906 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001907 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001908 }
1909
Victor Stinnerfe226c02011-10-03 03:52:20 +02001910 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001911 PyObject *new_unicode = resize_compact(unicode, length);
1912 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001913 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001914 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001915 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001916 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001917 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001918}
1919
Alexander Belopolsky40018472011-02-26 01:02:56 +00001920int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001921PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001922{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001923 PyObject *unicode;
1924 if (p_unicode == NULL) {
1925 PyErr_BadInternalCall();
1926 return -1;
1927 }
1928 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001929 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001930 {
1931 PyErr_BadInternalCall();
1932 return -1;
1933 }
1934 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001935}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001936
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001937/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001938
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001939 WARNING: The function doesn't copy the terminating null character and
1940 doesn't check the maximum character (may write a latin1 character in an
1941 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001942static void
1943unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1944 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001945{
1946 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1947 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001948 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001949
1950 switch (kind) {
1951 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001952 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001953#ifdef Py_DEBUG
1954 if (PyUnicode_IS_ASCII(unicode)) {
1955 Py_UCS4 maxchar = ucs1lib_find_max_char(
1956 (const Py_UCS1*)str,
1957 (const Py_UCS1*)str + len);
1958 assert(maxchar < 128);
1959 }
1960#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001961 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001962 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001963 }
1964 case PyUnicode_2BYTE_KIND: {
1965 Py_UCS2 *start = (Py_UCS2 *)data + index;
1966 Py_UCS2 *ucs2 = start;
1967 assert(index <= PyUnicode_GET_LENGTH(unicode));
1968
Victor Stinner184252a2012-06-16 02:57:41 +02001969 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001970 *ucs2 = (Py_UCS2)*str;
1971
1972 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001973 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001974 }
1975 default: {
1976 Py_UCS4 *start = (Py_UCS4 *)data + index;
1977 Py_UCS4 *ucs4 = start;
1978 assert(kind == PyUnicode_4BYTE_KIND);
1979 assert(index <= PyUnicode_GET_LENGTH(unicode));
1980
Victor Stinner184252a2012-06-16 02:57:41 +02001981 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001982 *ucs4 = (Py_UCS4)*str;
1983
1984 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001985 }
1986 }
1987}
1988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989static PyObject*
1990get_latin1_char(unsigned char ch)
1991{
Victor Stinnera464fc12011-10-02 20:39:30 +02001992 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001994 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 if (!unicode)
1996 return NULL;
1997 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001998 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 unicode_latin1[ch] = unicode;
2000 }
2001 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002002 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003}
2004
Victor Stinner985a82a2014-01-03 12:53:47 +01002005static PyObject*
2006unicode_char(Py_UCS4 ch)
2007{
2008 PyObject *unicode;
2009
2010 assert(ch <= MAX_UNICODE);
2011
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002012 if (ch < 256)
2013 return get_latin1_char(ch);
2014
Victor Stinner985a82a2014-01-03 12:53:47 +01002015 unicode = PyUnicode_New(1, ch);
2016 if (unicode == NULL)
2017 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002018
2019 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2020 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002021 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002022 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002023 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2024 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2025 }
2026 assert(_PyUnicode_CheckConsistency(unicode, 1));
2027 return unicode;
2028}
2029
Alexander Belopolsky40018472011-02-26 01:02:56 +00002030PyObject *
2031PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002033 if (u == NULL)
2034 return (PyObject*)_PyUnicode_New(size);
2035
2036 if (size < 0) {
2037 PyErr_BadInternalCall();
2038 return NULL;
2039 }
2040
2041 return PyUnicode_FromWideChar(u, size);
2042}
2043
2044PyObject *
2045PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2046{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002047 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 Py_UCS4 maxchar = 0;
2049 Py_ssize_t num_surrogates;
2050
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002051 if (u == NULL && size != 0) {
2052 PyErr_BadInternalCall();
2053 return NULL;
2054 }
2055
2056 if (size == -1) {
2057 size = wcslen(u);
2058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002060 /* If the Unicode data is known at construction time, we can apply
2061 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002064 if (size == 0)
2065 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 /* Single character Unicode objects in the Latin-1 range are
2068 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002069 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070 return get_latin1_char((unsigned char)*u);
2071
2072 /* If not empty and not single character, copy the Unicode data
2073 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002074 if (find_maxchar_surrogates(u, u + size,
2075 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 return NULL;
2077
Victor Stinner8faf8212011-12-08 22:14:11 +01002078 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 if (!unicode)
2080 return NULL;
2081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 switch (PyUnicode_KIND(unicode)) {
2083 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002084 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2086 break;
2087 case PyUnicode_2BYTE_KIND:
2088#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002089 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002091 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2093#endif
2094 break;
2095 case PyUnicode_4BYTE_KIND:
2096#if SIZEOF_WCHAR_T == 2
2097 /* This is the only case which has to process surrogates, thus
2098 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002099 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100#else
2101 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002102 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103#endif
2104 break;
2105 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002106 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002109 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110}
2111
Alexander Belopolsky40018472011-02-26 01:02:56 +00002112PyObject *
2113PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002114{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002115 if (size < 0) {
2116 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002117 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002118 return NULL;
2119 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002120 if (u != NULL)
2121 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2122 else
2123 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002124}
2125
Alexander Belopolsky40018472011-02-26 01:02:56 +00002126PyObject *
2127PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002128{
2129 size_t size = strlen(u);
2130 if (size > PY_SSIZE_T_MAX) {
2131 PyErr_SetString(PyExc_OverflowError, "input too long");
2132 return NULL;
2133 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002134 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002135}
2136
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137PyObject *
2138_PyUnicode_FromId(_Py_Identifier *id)
2139{
2140 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002141 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2142 strlen(id->string),
2143 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002144 if (!id->object)
2145 return NULL;
2146 PyUnicode_InternInPlace(&id->object);
2147 assert(!id->next);
2148 id->next = static_strings;
2149 static_strings = id;
2150 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002151 return id->object;
2152}
2153
2154void
2155_PyUnicode_ClearStaticStrings()
2156{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002157 _Py_Identifier *tmp, *s = static_strings;
2158 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002159 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002160 tmp = s->next;
2161 s->next = NULL;
2162 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002163 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002164 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002165}
2166
Benjamin Peterson0df54292012-03-26 14:50:32 -04002167/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002168
Victor Stinnerd3f08822012-05-29 12:57:52 +02002169PyObject*
2170_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002171{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002172 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002173 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002174 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002175#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002176 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002177#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002178 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002179 }
Victor Stinner785938e2011-12-11 20:09:03 +01002180 unicode = PyUnicode_New(size, 127);
2181 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002182 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002183 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2184 assert(_PyUnicode_CheckConsistency(unicode, 1));
2185 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002186}
2187
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002188static Py_UCS4
2189kind_maxchar_limit(unsigned int kind)
2190{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002191 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002192 case PyUnicode_1BYTE_KIND:
2193 return 0x80;
2194 case PyUnicode_2BYTE_KIND:
2195 return 0x100;
2196 case PyUnicode_4BYTE_KIND:
2197 return 0x10000;
2198 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002199 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002200 }
2201}
2202
Victor Stinner702c7342011-10-05 13:50:52 +02002203static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002204_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002207 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002208
Serhiy Storchaka678db842013-01-26 12:16:36 +02002209 if (size == 0)
2210 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002211 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002212 if (size == 1)
2213 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002215 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002216 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 if (!res)
2218 return NULL;
2219 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002220 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002222}
2223
Victor Stinnere57b1c02011-09-28 22:20:48 +02002224static PyObject*
2225_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226{
2227 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002228 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002229
Serhiy Storchaka678db842013-01-26 12:16:36 +02002230 if (size == 0)
2231 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002232 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002233 if (size == 1)
2234 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002235
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002236 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002237 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 if (!res)
2239 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002240 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002242 else {
2243 _PyUnicode_CONVERT_BYTES(
2244 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2245 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002246 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 return res;
2248}
2249
Victor Stinnere57b1c02011-09-28 22:20:48 +02002250static PyObject*
2251_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252{
2253 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002254 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002255
Serhiy Storchaka678db842013-01-26 12:16:36 +02002256 if (size == 0)
2257 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002258 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002259 if (size == 1)
2260 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002261
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002262 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002263 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 if (!res)
2265 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002266 if (max_char < 256)
2267 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2268 PyUnicode_1BYTE_DATA(res));
2269 else if (max_char < 0x10000)
2270 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2271 PyUnicode_2BYTE_DATA(res));
2272 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002274 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 return res;
2276}
2277
2278PyObject*
2279PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2280{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002281 if (size < 0) {
2282 PyErr_SetString(PyExc_ValueError, "size must be positive");
2283 return NULL;
2284 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002285 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002287 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002289 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002291 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002293 PyErr_SetString(PyExc_SystemError, "invalid kind");
2294 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296}
2297
Victor Stinnerece58de2012-04-23 23:36:38 +02002298Py_UCS4
2299_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2300{
2301 enum PyUnicode_Kind kind;
2302 void *startptr, *endptr;
2303
2304 assert(PyUnicode_IS_READY(unicode));
2305 assert(0 <= start);
2306 assert(end <= PyUnicode_GET_LENGTH(unicode));
2307 assert(start <= end);
2308
2309 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2310 return PyUnicode_MAX_CHAR_VALUE(unicode);
2311
2312 if (start == end)
2313 return 127;
2314
Victor Stinner94d558b2012-04-27 22:26:58 +02002315 if (PyUnicode_IS_ASCII(unicode))
2316 return 127;
2317
Victor Stinnerece58de2012-04-23 23:36:38 +02002318 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002319 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002320 endptr = (char *)startptr + end * kind;
2321 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002322 switch(kind) {
2323 case PyUnicode_1BYTE_KIND:
2324 return ucs1lib_find_max_char(startptr, endptr);
2325 case PyUnicode_2BYTE_KIND:
2326 return ucs2lib_find_max_char(startptr, endptr);
2327 case PyUnicode_4BYTE_KIND:
2328 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002329 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002330 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002331 }
2332}
2333
Victor Stinner25a4b292011-10-06 12:31:55 +02002334/* Ensure that a string uses the most efficient storage, if it is not the
2335 case: create a new string with of the right kind. Write NULL into *p_unicode
2336 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002337static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002338unicode_adjust_maxchar(PyObject **p_unicode)
2339{
2340 PyObject *unicode, *copy;
2341 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002342 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002343 unsigned int kind;
2344
2345 assert(p_unicode != NULL);
2346 unicode = *p_unicode;
2347 assert(PyUnicode_IS_READY(unicode));
2348 if (PyUnicode_IS_ASCII(unicode))
2349 return;
2350
2351 len = PyUnicode_GET_LENGTH(unicode);
2352 kind = PyUnicode_KIND(unicode);
2353 if (kind == PyUnicode_1BYTE_KIND) {
2354 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 max_char = ucs1lib_find_max_char(u, u + len);
2356 if (max_char >= 128)
2357 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002358 }
2359 else if (kind == PyUnicode_2BYTE_KIND) {
2360 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002361 max_char = ucs2lib_find_max_char(u, u + len);
2362 if (max_char >= 256)
2363 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 }
2365 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002366 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002367 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002368 max_char = ucs4lib_find_max_char(u, u + len);
2369 if (max_char >= 0x10000)
2370 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002372 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002373 if (copy != NULL)
2374 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002375 Py_DECREF(unicode);
2376 *p_unicode = copy;
2377}
2378
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002380_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002381{
Victor Stinner87af4f22011-11-21 23:03:47 +01002382 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002384
Victor Stinner034f6cf2011-09-30 02:26:44 +02002385 if (!PyUnicode_Check(unicode)) {
2386 PyErr_BadInternalCall();
2387 return NULL;
2388 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002389 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002390 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002391
Victor Stinner87af4f22011-11-21 23:03:47 +01002392 length = PyUnicode_GET_LENGTH(unicode);
2393 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002394 if (!copy)
2395 return NULL;
2396 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2397
Christian Heimesf051e432016-09-13 20:22:02 +02002398 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002399 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002400 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002401 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002402}
2403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405/* Widen Unicode objects to larger buffers. Don't write terminating null
2406 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407
2408void*
2409_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2410{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002411 Py_ssize_t len;
2412 void *result;
2413 unsigned int skind;
2414
Benjamin Petersonbac79492012-01-14 13:34:47 -05002415 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 return NULL;
2417
2418 len = PyUnicode_GET_LENGTH(s);
2419 skind = PyUnicode_KIND(s);
2420 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002421 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 return NULL;
2423 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002424 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 assert(skind == PyUnicode_1BYTE_KIND);
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS1, Py_UCS2,
2432 PyUnicode_1BYTE_DATA(s),
2433 PyUnicode_1BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002437 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002438 if (!result)
2439 return PyErr_NoMemory();
2440 if (skind == PyUnicode_2BYTE_KIND) {
2441 _PyUnicode_CONVERT_BYTES(
2442 Py_UCS2, Py_UCS4,
2443 PyUnicode_2BYTE_DATA(s),
2444 PyUnicode_2BYTE_DATA(s) + len,
2445 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002447 else {
2448 assert(skind == PyUnicode_1BYTE_KIND);
2449 _PyUnicode_CONVERT_BYTES(
2450 Py_UCS1, Py_UCS4,
2451 PyUnicode_1BYTE_DATA(s),
2452 PyUnicode_1BYTE_DATA(s) + len,
2453 result);
2454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002456 default:
2457 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 }
Victor Stinner01698042011-10-04 00:04:26 +02002459 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 return NULL;
2461}
2462
2463static Py_UCS4*
2464as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466{
2467 int kind;
2468 void *data;
2469 Py_ssize_t len, targetlen;
2470 if (PyUnicode_READY(string) == -1)
2471 return NULL;
2472 kind = PyUnicode_KIND(string);
2473 data = PyUnicode_DATA(string);
2474 len = PyUnicode_GET_LENGTH(string);
2475 targetlen = len;
2476 if (copy_null)
2477 targetlen++;
2478 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002479 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 if (!target) {
2481 PyErr_NoMemory();
2482 return NULL;
2483 }
2484 }
2485 else {
2486 if (targetsize < targetlen) {
2487 PyErr_Format(PyExc_SystemError,
2488 "string is longer than the buffer");
2489 if (copy_null && 0 < targetsize)
2490 target[0] = 0;
2491 return NULL;
2492 }
2493 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 if (kind == PyUnicode_1BYTE_KIND) {
2495 Py_UCS1 *start = (Py_UCS1 *) data;
2496 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002498 else if (kind == PyUnicode_2BYTE_KIND) {
2499 Py_UCS2 *start = (Py_UCS2 *) data;
2500 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2501 }
2502 else {
2503 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002504 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 if (copy_null)
2507 target[len] = 0;
2508 return target;
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2513 int copy_null)
2514{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002515 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 PyErr_BadInternalCall();
2517 return NULL;
2518 }
2519 return as_ucs4(string, target, targetsize, copy_null);
2520}
2521
2522Py_UCS4*
2523PyUnicode_AsUCS4Copy(PyObject *string)
2524{
2525 return as_ucs4(string, NULL, 0, 1);
2526}
2527
/* Buffer size, in characters, used for the output of %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, and 53/22 is an upper bound for log10(256).  In integer
   arithmetic (SIZEOF_LONG_LONG*53 - 1) / 22 is that ceiling minus one, so
   the expression below is the digit bound plus one for the sign.
   With SIZEOF_LONG_LONG == 8 it evaluates to 21. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002532
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533static int
2534unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2535 Py_ssize_t width, Py_ssize_t precision)
2536{
2537 Py_ssize_t length, fill, arglen;
2538 Py_UCS4 maxchar;
2539
2540 if (PyUnicode_READY(str) == -1)
2541 return -1;
2542
2543 length = PyUnicode_GET_LENGTH(str);
2544 if ((precision == -1 || precision >= length)
2545 && width <= length)
2546 return _PyUnicodeWriter_WriteStr(writer, str);
2547
2548 if (precision != -1)
2549 length = Py_MIN(precision, length);
2550
2551 arglen = Py_MAX(length, width);
2552 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2553 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2554 else
2555 maxchar = writer->maxchar;
2556
2557 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2558 return -1;
2559
2560 if (width > length) {
2561 fill = width - length;
2562 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2563 return -1;
2564 writer->pos += fill;
2565 }
2566
2567 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2568 str, 0, length);
2569 writer->pos += length;
2570 return 0;
2571}
2572
2573static int
Victor Stinner998b8062018-09-12 00:23:25 +02002574unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002575 Py_ssize_t width, Py_ssize_t precision)
2576{
2577 /* UTF-8 */
2578 Py_ssize_t length;
2579 PyObject *unicode;
2580 int res;
2581
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002582 if (precision == -1) {
2583 length = strlen(str);
2584 }
2585 else {
2586 length = 0;
2587 while (length < precision && str[length]) {
2588 length++;
2589 }
2590 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002591 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2592 if (unicode == NULL)
2593 return -1;
2594
2595 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2596 Py_DECREF(unicode);
2597 return res;
2598}
2599
Victor Stinner96865452011-03-01 23:44:09 +00002600static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002601unicode_fromformat_arg(_PyUnicodeWriter *writer,
2602 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002603{
Victor Stinnere215d962012-10-06 23:03:36 +02002604 const char *p;
2605 Py_ssize_t len;
2606 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002607 Py_ssize_t width;
2608 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002609 int longflag;
2610 int longlongflag;
2611 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002612 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002613
2614 p = f;
2615 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002616 zeropad = 0;
2617 if (*f == '0') {
2618 zeropad = 1;
2619 f++;
2620 }
Victor Stinner96865452011-03-01 23:44:09 +00002621
2622 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002623 width = -1;
2624 if (Py_ISDIGIT((unsigned)*f)) {
2625 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002626 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002627 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002629 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002631 return NULL;
2632 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002633 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002634 f++;
2635 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002636 }
2637 precision = -1;
2638 if (*f == '.') {
2639 f++;
2640 if (Py_ISDIGIT((unsigned)*f)) {
2641 precision = (*f - '0');
2642 f++;
2643 while (Py_ISDIGIT((unsigned)*f)) {
2644 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2645 PyErr_SetString(PyExc_ValueError,
2646 "precision too big");
2647 return NULL;
2648 }
2649 precision = (precision * 10) + (*f - '0');
2650 f++;
2651 }
2652 }
Victor Stinner96865452011-03-01 23:44:09 +00002653 if (*f == '%') {
2654 /* "%.3%s" => f points to "3" */
2655 f--;
2656 }
2657 }
2658 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002659 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002660 f--;
2661 }
Victor Stinner96865452011-03-01 23:44:09 +00002662
2663 /* Handle %ld, %lu, %lld and %llu. */
2664 longflag = 0;
2665 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002666 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002667 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002668 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002669 longflag = 1;
2670 ++f;
2671 }
Victor Stinner96865452011-03-01 23:44:09 +00002672 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002673 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002674 longlongflag = 1;
2675 f += 2;
2676 }
Victor Stinner96865452011-03-01 23:44:09 +00002677 }
2678 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002679 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002680 size_tflag = 1;
2681 ++f;
2682 }
Victor Stinnere215d962012-10-06 23:03:36 +02002683
2684 if (f[1] == '\0')
2685 writer->overallocate = 0;
2686
2687 switch (*f) {
2688 case 'c':
2689 {
2690 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002691 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002692 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002693 "character argument not in range(0x110000)");
2694 return NULL;
2695 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002696 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002697 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002698 break;
2699 }
2700
2701 case 'i':
2702 case 'd':
2703 case 'u':
2704 case 'x':
2705 {
2706 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002707 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002708 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002709
2710 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002714 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002715 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002716 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002717 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002719 va_arg(*vargs, size_t));
2720 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002721 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_arg(*vargs, unsigned int));
2723 }
2724 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002725 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002726 }
2727 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002728 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002729 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002730 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002731 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002732 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002733 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002734 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002735 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002736 va_arg(*vargs, Py_ssize_t));
2737 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002738 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002739 va_arg(*vargs, int));
2740 }
2741 assert(len >= 0);
2742
Victor Stinnere215d962012-10-06 23:03:36 +02002743 if (precision < len)
2744 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002745
2746 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2748 return NULL;
2749
Victor Stinnere215d962012-10-06 23:03:36 +02002750 if (width > precision) {
2751 Py_UCS4 fillchar;
2752 fill = width - precision;
2753 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002754 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2755 return NULL;
2756 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002757 }
Victor Stinner15a11362012-10-06 23:48:20 +02002758 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002759 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002760 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2761 return NULL;
2762 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002763 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002764
Victor Stinner4a587072013-11-19 12:54:53 +01002765 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2766 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002767 break;
2768 }
2769
2770 case 'p':
2771 {
2772 char number[MAX_LONG_LONG_CHARS];
2773
2774 len = sprintf(number, "%p", va_arg(*vargs, void*));
2775 assert(len >= 0);
2776
2777 /* %p is ill-defined: ensure leading 0x. */
2778 if (number[1] == 'X')
2779 number[1] = 'x';
2780 else if (number[1] != 'x') {
2781 memmove(number + 2, number,
2782 strlen(number) + 1);
2783 number[0] = '0';
2784 number[1] = 'x';
2785 len += 2;
2786 }
2787
Victor Stinner4a587072013-11-19 12:54:53 +01002788 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002789 return NULL;
2790 break;
2791 }
2792
2793 case 's':
2794 {
2795 /* UTF-8 */
2796 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002797 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002798 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002799 break;
2800 }
2801
2802 case 'U':
2803 {
2804 PyObject *obj = va_arg(*vargs, PyObject *);
2805 assert(obj && _PyUnicode_CHECK(obj));
2806
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002807 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002808 return NULL;
2809 break;
2810 }
2811
2812 case 'V':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002816 if (obj) {
2817 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002818 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002819 return NULL;
2820 }
2821 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002822 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002823 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002824 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002825 }
2826 break;
2827 }
2828
2829 case 'S':
2830 {
2831 PyObject *obj = va_arg(*vargs, PyObject *);
2832 PyObject *str;
2833 assert(obj);
2834 str = PyObject_Str(obj);
2835 if (!str)
2836 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002837 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002838 Py_DECREF(str);
2839 return NULL;
2840 }
2841 Py_DECREF(str);
2842 break;
2843 }
2844
2845 case 'R':
2846 {
2847 PyObject *obj = va_arg(*vargs, PyObject *);
2848 PyObject *repr;
2849 assert(obj);
2850 repr = PyObject_Repr(obj);
2851 if (!repr)
2852 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002853 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002854 Py_DECREF(repr);
2855 return NULL;
2856 }
2857 Py_DECREF(repr);
2858 break;
2859 }
2860
2861 case 'A':
2862 {
2863 PyObject *obj = va_arg(*vargs, PyObject *);
2864 PyObject *ascii;
2865 assert(obj);
2866 ascii = PyObject_ASCII(obj);
2867 if (!ascii)
2868 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002869 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002870 Py_DECREF(ascii);
2871 return NULL;
2872 }
2873 Py_DECREF(ascii);
2874 break;
2875 }
2876
2877 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002878 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002879 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002880 break;
2881
2882 default:
2883 /* if we stumble upon an unknown formatting code, copy the rest
2884 of the format string to the output string. (we cannot just
2885 skip the code, since there's no way to know what's in the
2886 argument list) */
2887 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002888 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002889 return NULL;
2890 f = p+len;
2891 return f;
2892 }
2893
2894 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002895 return f;
2896}
2897
Walter Dörwaldd2034312007-05-18 16:29:38 +00002898PyObject *
2899PyUnicode_FromFormatV(const char *format, va_list vargs)
2900{
Victor Stinnere215d962012-10-06 23:03:36 +02002901 va_list vargs2;
2902 const char *f;
2903 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904
Victor Stinner8f674cc2013-04-17 23:02:17 +02002905 _PyUnicodeWriter_Init(&writer);
2906 writer.min_length = strlen(format) + 100;
2907 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002908
Benjamin Peterson0c212142016-09-20 20:39:33 -07002909 // Copy varags to be able to pass a reference to a subfunction.
2910 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002911
2912 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002914 f = unicode_fromformat_arg(&writer, f, &vargs2);
2915 if (f == NULL)
2916 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002918 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002919 const char *p;
2920 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002921
Victor Stinnere215d962012-10-06 23:03:36 +02002922 p = f;
2923 do
2924 {
2925 if ((unsigned char)*p > 127) {
2926 PyErr_Format(PyExc_ValueError,
2927 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2928 "string, got a non-ASCII byte: 0x%02x",
2929 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002930 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002931 }
2932 p++;
2933 }
2934 while (*p != '\0' && *p != '%');
2935 len = p - f;
2936
2937 if (*p == '\0')
2938 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002939
2940 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002941 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002942
2943 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002944 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002946 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002947 return _PyUnicodeWriter_Finish(&writer);
2948
2949 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002950 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002951 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002953}
2954
Walter Dörwaldd2034312007-05-18 16:29:38 +00002955PyObject *
2956PyUnicode_FromFormat(const char *format, ...)
2957{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002958 PyObject* ret;
2959 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002960
2961#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002962 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002963#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002964 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002965#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002966 ret = PyUnicode_FromFormatV(format, vargs);
2967 va_end(vargs);
2968 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002969}
2970
Serhiy Storchakac46db922018-10-23 22:58:24 +03002971static Py_ssize_t
2972unicode_get_widechar_size(PyObject *unicode)
2973{
2974 Py_ssize_t res;
2975
2976 assert(unicode != NULL);
2977 assert(_PyUnicode_CHECK(unicode));
2978
2979 if (_PyUnicode_WSTR(unicode) != NULL) {
2980 return PyUnicode_WSTR_LENGTH(unicode);
2981 }
2982 assert(PyUnicode_IS_READY(unicode));
2983
2984 res = _PyUnicode_LENGTH(unicode);
2985#if SIZEOF_WCHAR_T == 2
2986 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2987 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2988 const Py_UCS4 *end = s + res;
2989 for (; s < end; ++s) {
2990 if (*s > 0xFFFF) {
2991 ++res;
2992 }
2993 }
2994 }
2995#endif
2996 return res;
2997}
2998
2999static void
3000unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3001{
3002 const wchar_t *wstr;
3003
3004 assert(unicode != NULL);
3005 assert(_PyUnicode_CHECK(unicode));
3006
3007 wstr = _PyUnicode_WSTR(unicode);
3008 if (wstr != NULL) {
3009 memcpy(w, wstr, size * sizeof(wchar_t));
3010 return;
3011 }
3012 assert(PyUnicode_IS_READY(unicode));
3013
3014 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3015 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3016 for (; size--; ++s, ++w) {
3017 *w = *s;
3018 }
3019 }
3020 else {
3021#if SIZEOF_WCHAR_T == 4
3022 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3023 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3024 for (; size--; ++s, ++w) {
3025 *w = *s;
3026 }
3027#else
3028 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3029 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3030 for (; size--; ++s, ++w) {
3031 Py_UCS4 ch = *s;
3032 if (ch > 0xFFFF) {
3033 assert(ch <= MAX_UNICODE);
3034 /* encode surrogate pair in this case */
3035 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3036 if (!size--)
3037 break;
3038 *w = Py_UNICODE_LOW_SURROGATE(ch);
3039 }
3040 else {
3041 *w = ch;
3042 }
3043 }
3044#endif
3045 }
3046}
3047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003048#ifdef HAVE_WCHAR_H
3049
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003050/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003051
Victor Stinnerd88d9832011-09-06 02:00:05 +02003052 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003053 character) required to convert the unicode object. Ignore size argument.
3054
Victor Stinnerd88d9832011-09-06 02:00:05 +02003055 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003056 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003057 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003058Py_ssize_t
3059PyUnicode_AsWideChar(PyObject *unicode,
3060 wchar_t *w,
3061 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003062{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003063 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003064
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003065 if (unicode == NULL) {
3066 PyErr_BadInternalCall();
3067 return -1;
3068 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003069 if (!PyUnicode_Check(unicode)) {
3070 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003071 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003072 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003073
3074 res = unicode_get_widechar_size(unicode);
3075 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003076 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003077 }
3078
3079 if (size > res) {
3080 size = res + 1;
3081 }
3082 else {
3083 res = size;
3084 }
3085 unicode_copy_as_widechar(unicode, w, size);
3086 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003087}
3088
Victor Stinner137c34c2010-09-29 10:25:54 +00003089wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003090PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003091 Py_ssize_t *size)
3092{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003093 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003094 Py_ssize_t buflen;
3095
3096 if (unicode == NULL) {
3097 PyErr_BadInternalCall();
3098 return NULL;
3099 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003100 if (!PyUnicode_Check(unicode)) {
3101 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003102 return NULL;
3103 }
3104
Serhiy Storchakac46db922018-10-23 22:58:24 +03003105 buflen = unicode_get_widechar_size(unicode);
3106 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003107 if (buffer == NULL) {
3108 PyErr_NoMemory();
3109 return NULL;
3110 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003111 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3112 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003113 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003114 }
3115 else if (wcslen(buffer) != (size_t)buflen) {
3116 PyMem_FREE(buffer);
3117 PyErr_SetString(PyExc_ValueError,
3118 "embedded null character");
3119 return NULL;
3120 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003121 return buffer;
3122}
3123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003124#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
Alexander Belopolsky40018472011-02-26 01:02:56 +00003126PyObject *
3127PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003128{
Victor Stinner8faf8212011-12-08 22:14:11 +01003129 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003130 PyErr_SetString(PyExc_ValueError,
3131 "chr() arg not in range(0x110000)");
3132 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003133 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003134
Victor Stinner985a82a2014-01-03 12:53:47 +01003135 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003136}
3137
Alexander Belopolsky40018472011-02-26 01:02:56 +00003138PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003139PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003141 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003142 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003143 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003144 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003145 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 Py_INCREF(obj);
3147 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003148 }
3149 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 /* For a Unicode subtype that's not a Unicode object,
3151 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003152 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003153 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003154 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003155 "Can't convert '%.100s' object to str implicitly",
3156 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003157 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003158}
3159
Alexander Belopolsky40018472011-02-26 01:02:56 +00003160PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003161PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003162 const char *encoding,
3163 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003164{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003165 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003166 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003167
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 PyErr_BadInternalCall();
3170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003172
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003173 /* Decoding bytes objects is the most common case and should be fast */
3174 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003175 if (PyBytes_GET_SIZE(obj) == 0)
3176 _Py_RETURN_UNICODE_EMPTY();
3177 v = PyUnicode_Decode(
3178 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3179 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003180 return v;
3181 }
3182
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003183 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003184 PyErr_SetString(PyExc_TypeError,
3185 "decoding str is not supported");
3186 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003187 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003188
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003189 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3190 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3191 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003192 "decoding to str: need a bytes-like object, %.80s found",
3193 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003194 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003195 }
Tim Petersced69f82003-09-16 20:30:58 +00003196
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003197 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003198 PyBuffer_Release(&buffer);
3199 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003201
Serhiy Storchaka05997252013-01-26 12:14:02 +02003202 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003203 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003204 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205}
3206
/* Normalize an encoding name into `lower` (a buffer of `lower_len` bytes):
   similar to encodings.normalize_encoding(), but also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Any run of other
   characters is collapsed into a single '_', except at the very start of the
   output where it is dropped; a trailing run is dropped as well.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters plus the terminating null). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *limit;
    int pending_sep;

    assert(encoding != NULL);

    out = lower;
    limit = &lower[lower_len - 1];  /* slot reserved for the final '\0' */
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
3257PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 Py_ssize_t size,
3259 const char *encoding,
3260 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003261{
3262 PyObject *buffer = NULL, *unicode;
3263 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003264 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3265
3266 if (encoding == NULL) {
3267 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3268 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003269
Fred Drakee4315f52000-05-09 19:53:39 +00003270 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003271 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3272 char *lower = buflower;
3273
3274 /* Fast paths */
3275 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3276 lower += 3;
3277 if (*lower == '_') {
3278 /* Match "utf8" and "utf_8" */
3279 lower++;
3280 }
3281
3282 if (lower[0] == '8' && lower[1] == 0) {
3283 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3284 }
3285 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3286 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3287 }
3288 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3289 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3290 }
3291 }
3292 else {
3293 if (strcmp(lower, "ascii") == 0
3294 || strcmp(lower, "us_ascii") == 0) {
3295 return PyUnicode_DecodeASCII(s, size, errors);
3296 }
Steve Dowercc16be82016-09-08 10:35:16 -07003297 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003298 else if (strcmp(lower, "mbcs") == 0) {
3299 return PyUnicode_DecodeMBCS(s, size, errors);
3300 }
3301 #endif
3302 else if (strcmp(lower, "latin1") == 0
3303 || strcmp(lower, "latin_1") == 0
3304 || strcmp(lower, "iso_8859_1") == 0
3305 || strcmp(lower, "iso8859_1") == 0) {
3306 return PyUnicode_DecodeLatin1(s, size, errors);
3307 }
3308 }
Victor Stinner37296e82010-06-10 13:36:23 +00003309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310
3311 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003312 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003313 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003314 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003315 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 if (buffer == NULL)
3317 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003318 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319 if (unicode == NULL)
3320 goto onError;
3321 if (!PyUnicode_Check(unicode)) {
3322 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003323 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003324 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003325 encoding,
3326 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 Py_DECREF(unicode);
3328 goto onError;
3329 }
3330 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003331 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003332
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 Py_XDECREF(buffer);
3335 return NULL;
3336}
3337
Alexander Belopolsky40018472011-02-26 01:02:56 +00003338PyObject *
3339PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003340 const char *encoding,
3341 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003342{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003343 if (!PyUnicode_Check(unicode)) {
3344 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003345 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003346 }
3347
Serhiy Storchaka00939072016-10-27 21:05:49 +03003348 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3349 "PyUnicode_AsDecodedObject() is deprecated; "
3350 "use PyCodec_Decode() to decode from str", 1) < 0)
3351 return NULL;
3352
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003353 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003355
3356 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003357 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003358}
3359
Alexander Belopolsky40018472011-02-26 01:02:56 +00003360PyObject *
3361PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003362 const char *encoding,
3363 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003364{
3365 PyObject *v;
3366
3367 if (!PyUnicode_Check(unicode)) {
3368 PyErr_BadArgument();
3369 goto onError;
3370 }
3371
Serhiy Storchaka00939072016-10-27 21:05:49 +03003372 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3373 "PyUnicode_AsDecodedUnicode() is deprecated; "
3374 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3375 return NULL;
3376
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003377 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003379
3380 /* Decode via the codec registry */
3381 v = PyCodec_Decode(unicode, encoding, errors);
3382 if (v == NULL)
3383 goto onError;
3384 if (!PyUnicode_Check(v)) {
3385 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003386 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003387 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003388 encoding,
3389 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003390 Py_DECREF(v);
3391 goto onError;
3392 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003393 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003394
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003396 return NULL;
3397}
3398
Alexander Belopolsky40018472011-02-26 01:02:56 +00003399PyObject *
3400PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003401 Py_ssize_t size,
3402 const char *encoding,
3403 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404{
3405 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003406
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003407 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003409 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3411 Py_DECREF(unicode);
3412 return v;
3413}
3414
Alexander Belopolsky40018472011-02-26 01:02:56 +00003415PyObject *
3416PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003417 const char *encoding,
3418 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003419{
3420 PyObject *v;
3421
3422 if (!PyUnicode_Check(unicode)) {
3423 PyErr_BadArgument();
3424 goto onError;
3425 }
3426
Serhiy Storchaka00939072016-10-27 21:05:49 +03003427 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3428 "PyUnicode_AsEncodedObject() is deprecated; "
3429 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3430 "or PyCodec_Encode() for generic encoding", 1) < 0)
3431 return NULL;
3432
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003433 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003434 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003435
3436 /* Encode via the codec registry */
3437 v = PyCodec_Encode(unicode, encoding, errors);
3438 if (v == NULL)
3439 goto onError;
3440 return v;
3441
Benjamin Peterson29060642009-01-31 22:14:21 +00003442 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003443 return NULL;
3444}
3445
Victor Stinner1b579672011-12-17 05:47:23 +01003446
Victor Stinner2cba6b82018-01-10 22:46:15 +01003447static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003448unicode_encode_locale(PyObject *unicode, const char *errors,
3449 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003451 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003452
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003453 Py_ssize_t wlen;
3454 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3455 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003456 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003457 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003458
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003459 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003460 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003461 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003462 return NULL;
3463 }
3464
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003465 char *str;
3466 size_t error_pos;
3467 const char *reason;
3468 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003469 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003470 PyMem_Free(wstr);
3471
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003472 if (res != 0) {
3473 if (res == -2) {
3474 PyObject *exc;
3475 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3476 "locale", unicode,
3477 (Py_ssize_t)error_pos,
3478 (Py_ssize_t)(error_pos+1),
3479 reason);
3480 if (exc != NULL) {
3481 PyCodec_StrictErrors(exc);
3482 Py_DECREF(exc);
3483 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003484 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003485 else if (res == -3) {
3486 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3487 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003488 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003489 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003490 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003491 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003492 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003494 PyObject *bytes = PyBytes_FromString(str);
3495 PyMem_RawFree(str);
3496 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003497}
3498
Victor Stinnerad158722010-10-27 00:25:46 +00003499PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003500PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3501{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003502 return unicode_encode_locale(unicode, errors, 1);
3503}
3504
3505PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003506PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003507{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003508 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003509 const _PyCoreConfig *config = &interp->core_config;
Victor Stinnere2510952019-05-02 11:28:57 -04003510#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003511 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3512#else
Victor Stinner793b5312011-04-27 00:24:21 +02003513 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3514 cannot use it to encode and decode filenames before it is loaded. Load
3515 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003516 implementation of the locale codec until the codec registry is
3517 initialized and the Python codec is loaded. See initfsencoding(). */
3518 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003519 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003520 config->filesystem_encoding,
3521 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003522 }
3523 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003524 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003525 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003526 }
Victor Stinnerad158722010-10-27 00:25:46 +00003527#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003528}
3529
Alexander Belopolsky40018472011-02-26 01:02:56 +00003530PyObject *
3531PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003532 const char *encoding,
3533 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534{
3535 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003536 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003537
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 if (!PyUnicode_Check(unicode)) {
3539 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003540 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541 }
Fred Drakee4315f52000-05-09 19:53:39 +00003542
Victor Stinner942889a2016-09-05 15:40:10 -07003543 if (encoding == NULL) {
3544 return _PyUnicode_AsUTF8String(unicode, errors);
3545 }
3546
Fred Drakee4315f52000-05-09 19:53:39 +00003547 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003548 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3549 char *lower = buflower;
3550
3551 /* Fast paths */
3552 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3553 lower += 3;
3554 if (*lower == '_') {
3555 /* Match "utf8" and "utf_8" */
3556 lower++;
3557 }
3558
3559 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003560 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003561 }
3562 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3563 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3564 }
3565 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3566 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3567 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003568 }
Victor Stinner942889a2016-09-05 15:40:10 -07003569 else {
3570 if (strcmp(lower, "ascii") == 0
3571 || strcmp(lower, "us_ascii") == 0) {
3572 return _PyUnicode_AsASCIIString(unicode, errors);
3573 }
Steve Dowercc16be82016-09-08 10:35:16 -07003574#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003575 else if (strcmp(lower, "mbcs") == 0) {
3576 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3577 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003578#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003579 else if (strcmp(lower, "latin1") == 0 ||
3580 strcmp(lower, "latin_1") == 0 ||
3581 strcmp(lower, "iso_8859_1") == 0 ||
3582 strcmp(lower, "iso8859_1") == 0) {
3583 return _PyUnicode_AsLatin1String(unicode, errors);
3584 }
3585 }
Victor Stinner37296e82010-06-10 13:36:23 +00003586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587
3588 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003589 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003591 return NULL;
3592
3593 /* The normal path */
3594 if (PyBytes_Check(v))
3595 return v;
3596
3597 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003598 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003599 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003600 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003601
3602 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003603 "encoder %s returned bytearray instead of bytes; "
3604 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003605 encoding);
3606 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003607 Py_DECREF(v);
3608 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003609 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003610
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003611 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3612 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003613 Py_DECREF(v);
3614 return b;
3615 }
3616
3617 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003618 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003619 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003620 encoding,
3621 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003622 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003623 return NULL;
3624}
3625
Alexander Belopolsky40018472011-02-26 01:02:56 +00003626PyObject *
3627PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003628 const char *encoding,
3629 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003630{
3631 PyObject *v;
3632
3633 if (!PyUnicode_Check(unicode)) {
3634 PyErr_BadArgument();
3635 goto onError;
3636 }
3637
Serhiy Storchaka00939072016-10-27 21:05:49 +03003638 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3639 "PyUnicode_AsEncodedUnicode() is deprecated; "
3640 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3641 return NULL;
3642
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003643 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003644 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003645
3646 /* Encode via the codec registry */
3647 v = PyCodec_Encode(unicode, encoding, errors);
3648 if (v == NULL)
3649 goto onError;
3650 if (!PyUnicode_Check(v)) {
3651 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003652 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003653 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003654 encoding,
3655 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003656 Py_DECREF(v);
3657 goto onError;
3658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003660
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 return NULL;
3663}
3664
Victor Stinner2cba6b82018-01-10 22:46:15 +01003665static PyObject*
3666unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3667 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003668{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003669 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003670
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003671 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3672 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003673 return NULL;
3674 }
3675
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003676 wchar_t *wstr;
3677 size_t wlen;
3678 const char *reason;
3679 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003680 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003681 if (res != 0) {
3682 if (res == -2) {
3683 PyObject *exc;
3684 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3685 "locale", str, len,
3686 (Py_ssize_t)wlen,
3687 (Py_ssize_t)(wlen + 1),
3688 reason);
3689 if (exc != NULL) {
3690 PyCodec_StrictErrors(exc);
3691 Py_DECREF(exc);
3692 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003693 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003694 else if (res == -3) {
3695 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3696 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003697 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003698 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003699 }
Victor Stinner2f197072011-12-17 07:08:30 +01003700 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003701 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003702
3703 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3704 PyMem_RawFree(wstr);
3705 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003706}
3707
3708PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003709PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3710 const char *errors)
3711{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003712 return unicode_decode_locale(str, len, errors, 1);
3713}
3714
3715PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003716PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717{
3718 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003719 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003720}
3721
3722
3723PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003724PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003725 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003726 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3727}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003728
Christian Heimes5894ba72007-11-04 11:43:14 +00003729PyObject*
3730PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3731{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003732 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003733 const _PyCoreConfig *config = &interp->core_config;
Victor Stinnere2510952019-05-02 11:28:57 -04003734#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003735 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3736#else
Victor Stinner793b5312011-04-27 00:24:21 +02003737 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3738 cannot use it to encode and decode filenames before it is loaded. Load
3739 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003740 implementation of the locale codec until the codec registry is
3741 initialized and the Python codec is loaded. See initfsencoding(). */
3742 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003743 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003744 config->filesystem_encoding,
3745 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003746 }
3747 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003748 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003749 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003750 }
Victor Stinnerad158722010-10-27 00:25:46 +00003751#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003752}
3753
Martin v. Löwis011e8422009-05-05 04:43:17 +00003754
3755int
3756PyUnicode_FSConverter(PyObject* arg, void* addr)
3757{
Brett Cannonec6ce872016-09-06 15:50:29 -07003758 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003759 PyObject *output = NULL;
3760 Py_ssize_t size;
3761 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003762 if (arg == NULL) {
3763 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003764 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003765 return 1;
3766 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003767 path = PyOS_FSPath(arg);
3768 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003769 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003771 if (PyBytes_Check(path)) {
3772 output = path;
3773 }
3774 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3775 output = PyUnicode_EncodeFSDefault(path);
3776 Py_DECREF(path);
3777 if (!output) {
3778 return 0;
3779 }
3780 assert(PyBytes_Check(output));
3781 }
3782
Victor Stinner0ea2a462010-04-30 00:22:08 +00003783 size = PyBytes_GET_SIZE(output);
3784 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003785 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003786 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003787 Py_DECREF(output);
3788 return 0;
3789 }
3790 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003791 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003792}
3793
3794
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003795int
3796PyUnicode_FSDecoder(PyObject* arg, void* addr)
3797{
Brett Cannona5711202016-09-06 19:36:01 -07003798 int is_buffer = 0;
3799 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003800 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003801 if (arg == NULL) {
3802 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003803 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003804 return 1;
3805 }
Brett Cannona5711202016-09-06 19:36:01 -07003806
3807 is_buffer = PyObject_CheckBuffer(arg);
3808 if (!is_buffer) {
3809 path = PyOS_FSPath(arg);
3810 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003811 return 0;
3812 }
Brett Cannona5711202016-09-06 19:36:01 -07003813 }
3814 else {
3815 path = arg;
3816 Py_INCREF(arg);
3817 }
3818
3819 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003820 output = path;
3821 }
3822 else if (PyBytes_Check(path) || is_buffer) {
3823 PyObject *path_bytes = NULL;
3824
3825 if (!PyBytes_Check(path) &&
3826 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003827 "path should be string, bytes, or os.PathLike, not %.200s",
3828 Py_TYPE(arg)->tp_name)) {
3829 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003830 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003831 }
3832 path_bytes = PyBytes_FromObject(path);
3833 Py_DECREF(path);
3834 if (!path_bytes) {
3835 return 0;
3836 }
3837 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3838 PyBytes_GET_SIZE(path_bytes));
3839 Py_DECREF(path_bytes);
3840 if (!output) {
3841 return 0;
3842 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003843 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003844 else {
3845 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003846 "path should be string, bytes, or os.PathLike, not %.200s",
3847 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003848 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003849 return 0;
3850 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003851 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003852 Py_DECREF(output);
3853 return 0;
3854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003856 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003857 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003858 Py_DECREF(output);
3859 return 0;
3860 }
3861 *(PyObject**)addr = output;
3862 return Py_CLEANUP_SUPPORTED;
3863}
3864
3865
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003866const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003868{
Christian Heimesf3863112007-11-22 07:46:41 +00003869 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003871 if (!PyUnicode_Check(unicode)) {
3872 PyErr_BadArgument();
3873 return NULL;
3874 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003875 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003876 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003878 if (PyUnicode_UTF8(unicode) == NULL) {
3879 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003880 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 if (bytes == NULL)
3882 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003883 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3884 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003885 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886 Py_DECREF(bytes);
3887 return NULL;
3888 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003889 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003890 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 PyBytes_AS_STRING(bytes),
3892 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 Py_DECREF(bytes);
3894 }
3895
3896 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003897 *psize = PyUnicode_UTF8_LENGTH(unicode);
3898 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003899}
3900
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003901const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003902PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3905}
3906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907Py_UNICODE *
3908PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910 if (!PyUnicode_Check(unicode)) {
3911 PyErr_BadArgument();
3912 return NULL;
3913 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003914 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3915 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003917 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003918 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919
Serhiy Storchakac46db922018-10-23 22:58:24 +03003920 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3921 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3922 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003925 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3926 if (w == NULL) {
3927 PyErr_NoMemory();
3928 return NULL;
3929 }
3930 unicode_copy_as_widechar(unicode, w, wlen + 1);
3931 _PyUnicode_WSTR(unicode) = w;
3932 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3933 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 }
3935 }
3936 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003937 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003938 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003939}
3940
Alexander Belopolsky40018472011-02-26 01:02:56 +00003941Py_UNICODE *
3942PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945}
3946
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003947const Py_UNICODE *
3948_PyUnicode_AsUnicode(PyObject *unicode)
3949{
3950 Py_ssize_t size;
3951 const Py_UNICODE *wstr;
3952
3953 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3954 if (wstr && wcslen(wstr) != (size_t)size) {
3955 PyErr_SetString(PyExc_ValueError, "embedded null character");
3956 return NULL;
3957 }
3958 return wstr;
3959}
3960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961
Alexander Belopolsky40018472011-02-26 01:02:56 +00003962Py_ssize_t
3963PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964{
3965 if (!PyUnicode_Check(unicode)) {
3966 PyErr_BadArgument();
3967 goto onError;
3968 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003969 if (_PyUnicode_WSTR(unicode) == NULL) {
3970 if (PyUnicode_AsUnicode(unicode) == NULL)
3971 goto onError;
3972 }
3973 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 return -1;
3977}
3978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979Py_ssize_t
3980PyUnicode_GetLength(PyObject *unicode)
3981{
Victor Stinner07621332012-06-16 04:53:46 +02003982 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983 PyErr_BadArgument();
3984 return -1;
3985 }
Victor Stinner07621332012-06-16 04:53:46 +02003986 if (PyUnicode_READY(unicode) == -1)
3987 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988 return PyUnicode_GET_LENGTH(unicode);
3989}
3990
3991Py_UCS4
3992PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3993{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003994 void *data;
3995 int kind;
3996
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003997 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003998 PyErr_BadArgument();
3999 return (Py_UCS4)-1;
4000 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004001 if (PyUnicode_READY(unicode) == -1) {
4002 return (Py_UCS4)-1;
4003 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004004 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004005 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 return (Py_UCS4)-1;
4007 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004008 data = PyUnicode_DATA(unicode);
4009 kind = PyUnicode_KIND(unicode);
4010 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011}
4012
4013int
4014PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4015{
4016 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004017 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 return -1;
4019 }
Victor Stinner488fa492011-12-12 00:01:39 +01004020 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004021 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004022 PyErr_SetString(PyExc_IndexError, "string index out of range");
4023 return -1;
4024 }
Victor Stinner488fa492011-12-12 00:01:39 +01004025 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004026 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004027 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4028 PyErr_SetString(PyExc_ValueError, "character out of range");
4029 return -1;
4030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4032 index, ch);
4033 return 0;
4034}
4035
/* Return the name of the default encoding, the constant string "utf-8".
   The returned pointer refers to static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4041
Victor Stinner554f3f02010-06-16 23:33:54 +00004042/* create or adjust a UnicodeDecodeError */
4043static void
4044make_decode_exception(PyObject **exceptionObject,
4045 const char *encoding,
4046 const char *input, Py_ssize_t length,
4047 Py_ssize_t startpos, Py_ssize_t endpos,
4048 const char *reason)
4049{
4050 if (*exceptionObject == NULL) {
4051 *exceptionObject = PyUnicodeDecodeError_Create(
4052 encoding, input, length, startpos, endpos, reason);
4053 }
4054 else {
4055 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4056 goto onError;
4057 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4058 goto onError;
4059 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4060 goto onError;
4061 }
4062 return;
4063
4064onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004065 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004066}
4067
Steve Dowercc16be82016-09-08 10:35:16 -07004068#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004069static int
4070widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4071{
4072 if (newsize > *size) {
4073 wchar_t *newbuf = *buf;
4074 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4075 PyErr_NoMemory();
4076 return -1;
4077 }
4078 *buf = newbuf;
4079 }
4080 *size = newsize;
4081 return 0;
4082}
4083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004084/* error handling callback helper:
4085 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004086 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 and adjust various state variables.
4088 return 0 on success, -1 on error
4089*/
4090
Alexander Belopolsky40018472011-02-26 01:02:56 +00004091static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004092unicode_decode_call_errorhandler_wchar(
4093 const char *errors, PyObject **errorHandler,
4094 const char *encoding, const char *reason,
4095 const char **input, const char **inend, Py_ssize_t *startinpos,
4096 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004097 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004098{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004099 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100
4101 PyObject *restuple = NULL;
4102 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004103 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004104 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004105 Py_ssize_t requiredsize;
4106 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004107 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004108 wchar_t *repwstr;
4109 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110
4111 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004112 *errorHandler = PyCodec_LookupError(errors);
4113 if (*errorHandler == NULL)
4114 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 }
4116
Victor Stinner554f3f02010-06-16 23:33:54 +00004117 make_decode_exception(exceptionObject,
4118 encoding,
4119 *input, *inend - *input,
4120 *startinpos, *endinpos,
4121 reason);
4122 if (*exceptionObject == NULL)
4123 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004125 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004127 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004129 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004132 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004134
4135 /* Copy back the bytes variables, which might have been modified by the
4136 callback */
4137 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4138 if (!inputobj)
4139 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004140 *input = PyBytes_AS_STRING(inputobj);
4141 insize = PyBytes_GET_SIZE(inputobj);
4142 *inend = *input + insize;
4143 /* we can DECREF safely, as the exception has another reference,
4144 so the object won't go away. */
4145 Py_DECREF(inputobj);
4146
4147 if (newpos<0)
4148 newpos = insize+newpos;
4149 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004150 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004151 goto onError;
4152 }
4153
4154 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4155 if (repwstr == NULL)
4156 goto onError;
4157 /* need more space? (at least enough for what we
4158 have+the replacement+the rest of the string (starting
4159 at the new input position), so we won't have to check space
4160 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004161 requiredsize = *outpos;
4162 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4163 goto overflow;
4164 requiredsize += repwlen;
4165 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4166 goto overflow;
4167 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004168 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004169 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004170 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004171 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004172 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004173 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004174 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004175 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004176 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004177 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004178 *endinpos = newpos;
4179 *inptr = *input + newpos;
4180
4181 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004182 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004183 return 0;
4184
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004185 overflow:
4186 PyErr_SetString(PyExc_OverflowError,
4187 "decoded result is too long for a Python string");
4188
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004189 onError:
4190 Py_XDECREF(restuple);
4191 return -1;
4192}
Steve Dowercc16be82016-09-08 10:35:16 -07004193#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004194
4195static int
4196unicode_decode_call_errorhandler_writer(
4197 const char *errors, PyObject **errorHandler,
4198 const char *encoding, const char *reason,
4199 const char **input, const char **inend, Py_ssize_t *startinpos,
4200 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4201 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4202{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004203 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004204
4205 PyObject *restuple = NULL;
4206 PyObject *repunicode = NULL;
4207 Py_ssize_t insize;
4208 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004209 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004210 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004211 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004212 int need_to_grow = 0;
4213 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004214
4215 if (*errorHandler == NULL) {
4216 *errorHandler = PyCodec_LookupError(errors);
4217 if (*errorHandler == NULL)
4218 goto onError;
4219 }
4220
4221 make_decode_exception(exceptionObject,
4222 encoding,
4223 *input, *inend - *input,
4224 *startinpos, *endinpos,
4225 reason);
4226 if (*exceptionObject == NULL)
4227 goto onError;
4228
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004229 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004230 if (restuple == NULL)
4231 goto onError;
4232 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004233 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004234 goto onError;
4235 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004236 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004237 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004238
4239 /* Copy back the bytes variables, which might have been modified by the
4240 callback */
4241 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4242 if (!inputobj)
4243 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004244 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004245 *input = PyBytes_AS_STRING(inputobj);
4246 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004247 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004248 /* we can DECREF safely, as the exception has another reference,
4249 so the object won't go away. */
4250 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004254 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004255 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004257 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258
Victor Stinner170ca6f2013-04-18 00:25:28 +02004259 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004260 if (replen > 1) {
4261 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004262 need_to_grow = 1;
4263 }
4264 new_inptr = *input + newpos;
4265 if (*inend - new_inptr > remain) {
4266 /* We don't know the decoding algorithm here so we make the worst
4267 assumption that one byte decodes to one unicode character.
4268 If unfortunately one byte could decode to more unicode characters,
4269 the decoder may write out-of-bound then. Is it possible for the
4270 algorithms using this function? */
4271 writer->min_length += *inend - new_inptr - remain;
4272 need_to_grow = 1;
4273 }
4274 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004275 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004276 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004277 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4278 goto onError;
4279 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004280 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004281 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004284 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004285
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004287 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004288 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293}
4294
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004295/* --- UTF-7 Codec -------------------------------------------------------- */
4296
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297/* See RFC2152 for details. We encode conservatively and decode liberally. */
4298
/* Base-64 helper macros for the UTF-7 codec.  Each of them may expand its
   argument several times, so only pass side-effect-free expressions. */

/* Non-zero iff c is one of the 64 base-64 alphabet characters. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))

/* Map a base-64 character to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62, anything else: 63).  Only meaningful when
   IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? ((c) - 'A') :             \
     (('a' <= (c) && (c) <= 'z') ? ((c) - 'a' + 26) :       \
      (('0' <= (c) && (c) <= '9') ? ((c) - '0' + 52) :      \
       ('+' == (c) ? 62 : 63))))

/* Base-64 character for the low six bits of n. */

#define TO_BASE64(n) \
    (*("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + ((n) & 0x3f)))

/* DECODE_DIRECT: true when a byte found in UTF-7 input outside a shift
 * sequence stands for itself.  Decoding is permissive: every value up to
 * 127 qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4329
4330/* The UTF-7 encoder treats ASCII characters differently according to
4331 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4332 * the above). See RFC2152. This array identifies these different
4333 * sets:
4334 * 0 : "Set D"
4335 * alphanumeric and '(),-./:?
4336 * 1 : "Set O"
4337 * !"#$%&*;<=>@[]^_`{|}
4338 * 2 : "whitespace"
4339 * ht nl cr sp
4340 * 3 : special (must be base64 encoded)
4341 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4342 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343
/* Category of each of the 128 ASCII code points for the UTF-7 encoder:
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = must be base-64 encoded.
   The table is only ever read, hence const. */
static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - code point to classify; values outside 1..127 are never direct
 * directO  - non-zero to emit Set O characters unencoded
 * directWS - non-zero to emit whitespace unencoded
 *
 * The flag arguments are parenthesized so that compound expressions such
 * as `a || b` keep their meaning.  c is expanded several times and must
 * not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375
Alexander Belopolsky40018472011-02-26 01:02:56 +00004376PyObject *
4377PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004378 Py_ssize_t size,
4379 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004381 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4382}
4383
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384/* The decoder. The only state we preserve is our read position,
4385 * i.e. how many characters we have consumed. So if we end in the
4386 * middle of a shift sequence we have to back off the read position
4387 * and the output to the beginning of the sequence, otherwise we lose
4388 * all the shift state (seen bits, number of bits seen, high
4389 * surrogate). */
4390
Alexander Belopolsky40018472011-02-26 01:02:56 +00004391PyObject *
4392PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004393 Py_ssize_t size,
4394 const char *errors,
4395 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004396{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004398 Py_ssize_t startinpos;
4399 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004401 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004402 const char *errmsg = "";
4403 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004404 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405 unsigned int base64bits = 0;
4406 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004407 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 PyObject *errorHandler = NULL;
4409 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004411 if (size == 0) {
4412 if (consumed)
4413 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004414 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004415 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004417 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004418 _PyUnicodeWriter_Init(&writer);
4419 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004420
4421 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 e = s + size;
4423
4424 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004425 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004426 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004427 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 if (inShift) { /* in a base-64 section */
4430 if (IS_BASE64(ch)) { /* consume a base-64 character */
4431 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4432 base64bits += 6;
4433 s++;
4434 if (base64bits >= 16) {
4435 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004436 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004437 base64bits -= 16;
4438 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004439 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 if (surrogate) {
4441 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004442 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4443 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004444 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004445 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004446 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004447 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 }
4449 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004450 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004451 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 }
4454 }
Victor Stinner551ac952011-11-29 22:58:13 +01004455 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 /* first surrogate */
4457 surrogate = outCh;
4458 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004460 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004461 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 }
4463 }
4464 }
4465 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 if (base64bits > 0) { /* left-over bits */
4468 if (base64bits >= 6) {
4469 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004470 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 errmsg = "partial character in shift sequence";
4472 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474 else {
4475 /* Some bits remain; they should be zero */
4476 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004477 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 errmsg = "non-zero padding bits in shift sequence";
4479 goto utf7Error;
4480 }
4481 }
4482 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004483 if (surrogate && DECODE_DIRECT(ch)) {
4484 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4485 goto onError;
4486 }
4487 surrogate = 0;
4488 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004489 /* '-' is absorbed; other terminating
4490 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004491 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493 }
4494 }
4495 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 s++; /* consume '+' */
4498 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004500 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004501 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004503 else if (s < e && !IS_BASE64(*s)) {
4504 s++;
4505 errmsg = "ill-formed sequence";
4506 goto utf7Error;
4507 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004508 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004509 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004510 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004511 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004512 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004513 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514 }
4515 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004518 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004519 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 else {
4522 startinpos = s-starts;
4523 s++;
4524 errmsg = "unexpected special character";
4525 goto utf7Error;
4526 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004528utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004530 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 errors, &errorHandler,
4532 "utf7", errmsg,
4533 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 }
4537
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538 /* end of string */
4539
4540 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4541 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004542 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 if (surrogate ||
4544 (base64bits >= 6) ||
4545 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004547 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 errors, &errorHandler,
4549 "utf7", "unterminated shift sequence",
4550 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004551 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552 goto onError;
4553 if (s < e)
4554 goto restart;
4555 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557
4558 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004559 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004561 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004562 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004563 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004564 writer.kind, writer.data, shiftOutStart);
4565 Py_XDECREF(errorHandler);
4566 Py_XDECREF(exc);
4567 _PyUnicodeWriter_Dealloc(&writer);
4568 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004569 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004570 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 }
4572 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004573 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004575 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 Py_XDECREF(errorHandler);
4578 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004579 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004580
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 Py_XDECREF(errorHandler);
4583 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004584 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585 return NULL;
4586}
4587
4588
Alexander Belopolsky40018472011-02-26 01:02:56 +00004589PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004590_PyUnicode_EncodeUTF7(PyObject *str,
4591 int base64SetO,
4592 int base64WhiteSpace,
4593 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004595 int kind;
4596 void *data;
4597 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004598 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004600 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 unsigned int base64bits = 0;
4602 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004603 char * out;
4604 char * start;
4605
Benjamin Petersonbac79492012-01-14 13:34:47 -05004606 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004607 return NULL;
4608 kind = PyUnicode_KIND(str);
4609 data = PyUnicode_DATA(str);
4610 len = PyUnicode_GET_LENGTH(str);
4611
4612 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004615 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004616 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004617 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004618 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619 if (v == NULL)
4620 return NULL;
4621
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004622 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004623 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004624 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625
Antoine Pitrou244651a2009-05-04 18:56:13 +00004626 if (inShift) {
4627 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4628 /* shifting out */
4629 if (base64bits) { /* output remaining bits */
4630 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4631 base64buffer = 0;
4632 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 }
4634 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 /* Characters not in the BASE64 set implicitly unshift the sequence
4636 so no '-' is required, except if the character is itself a '-' */
4637 if (IS_BASE64(ch) || ch == '-') {
4638 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004639 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004640 *out++ = (char) ch;
4641 }
4642 else {
4643 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004644 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004645 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004646 else { /* not in a shift sequence */
4647 if (ch == '+') {
4648 *out++ = '+';
4649 *out++ = '-';
4650 }
4651 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4652 *out++ = (char) ch;
4653 }
4654 else {
4655 *out++ = '+';
4656 inShift = 1;
4657 goto encode_char;
4658 }
4659 }
4660 continue;
4661encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004663 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004664
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 /* code first surrogate */
4666 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004667 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668 while (base64bits >= 6) {
4669 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4670 base64bits -= 6;
4671 }
4672 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004673 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004675 base64bits += 16;
4676 base64buffer = (base64buffer << 16) | ch;
4677 while (base64bits >= 6) {
4678 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4679 base64bits -= 6;
4680 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004681 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004682 if (base64bits)
4683 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4684 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004685 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004686 if (_PyBytes_Resize(&v, out - start) < 0)
4687 return NULL;
4688 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004690PyObject *
4691PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4692 Py_ssize_t size,
4693 int base64SetO,
4694 int base64WhiteSpace,
4695 const char *errors)
4696{
4697 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004698 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004699 if (tmp == NULL)
4700 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004701 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004702 base64WhiteSpace, errors);
4703 Py_DECREF(tmp);
4704 return result;
4705}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004706
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707#undef IS_BASE64
4708#undef FROM_BASE64
4709#undef TO_BASE64
4710#undef DECODE_DIRECT
4711#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713/* --- UTF-8 Codec -------------------------------------------------------- */
4714
Alexander Belopolsky40018472011-02-26 01:02:56 +00004715PyObject *
4716PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004717 Py_ssize_t size,
4718 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719{
Walter Dörwald69652032004-09-07 20:24:22 +00004720 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4721}
4722
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723#include "stringlib/asciilib.h"
4724#include "stringlib/codecs.h"
4725#include "stringlib/undef.h"
4726
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004727#include "stringlib/ucs1lib.h"
4728#include "stringlib/codecs.h"
4729#include "stringlib/undef.h"
4730
4731#include "stringlib/ucs2lib.h"
4732#include "stringlib/codecs.h"
4733#include "stringlib/undef.h"
4734
4735#include "stringlib/ucs4lib.h"
4736#include "stringlib/codecs.h"
4737#include "stringlib/undef.h"
4738
Antoine Pitrouab868312009-01-10 15:40:25 +00004739/* Mask to quickly check whether a C 'long' contains a
4740 non-ASCII, UTF8-encoded char. */
4741#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004742# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004743#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004744# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004745#else
4746# error C 'long' size should be either 4 or 8!
4747#endif
4748
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004749static Py_ssize_t
4750ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004751{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004752 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004753 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004754
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004755 /*
4756 * Issue #17237: m68k is a bit different from most architectures in
4757 * that objects do not use "natural alignment" - for example, int and
4758 * long are only aligned at 2-byte boundaries. Therefore the assert()
4759 * won't work; also, tests have shown that skipping the "optimised
4760 * version" will even speed up m68k.
4761 */
4762#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004763#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004764 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4765 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004766 /* Fast path, see in STRINGLIB(utf8_decode) for
4767 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004768 /* Help allocation */
4769 const char *_p = p;
4770 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 while (_p < aligned_end) {
4772 unsigned long value = *(const unsigned long *) _p;
4773 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 *((unsigned long *)q) = value;
4776 _p += SIZEOF_LONG;
4777 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004778 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 p = _p;
4780 while (p < end) {
4781 if ((unsigned char)*p & 0x80)
4782 break;
4783 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004788#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004789 while (p < end) {
4790 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4791 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004792 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004793 /* Help allocation */
4794 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004795 while (_p < aligned_end) {
4796 unsigned long value = *(unsigned long *) _p;
4797 if (value & ASCII_CHAR_MASK)
4798 break;
4799 _p += SIZEOF_LONG;
4800 }
4801 p = _p;
4802 if (_p == end)
4803 break;
4804 }
4805 if ((unsigned char)*p & 0x80)
4806 break;
4807 ++p;
4808 }
4809 memcpy(dest, start, p - start);
4810 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811}
Antoine Pitrouab868312009-01-10 15:40:25 +00004812
Victor Stinner785938e2011-12-11 20:09:03 +01004813PyObject *
4814PyUnicode_DecodeUTF8Stateful(const char *s,
4815 Py_ssize_t size,
4816 const char *errors,
4817 Py_ssize_t *consumed)
4818{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004819 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004820 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004821 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004822
4823 Py_ssize_t startinpos;
4824 Py_ssize_t endinpos;
4825 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004826 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004827 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004828 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004829
4830 if (size == 0) {
4831 if (consumed)
4832 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004833 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004834 }
4835
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004836 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4837 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004838 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004839 *consumed = 1;
4840 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004841 }
4842
Victor Stinner8f674cc2013-04-17 23:02:17 +02004843 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004844 writer.min_length = size;
4845 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004846 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004847
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004848 writer.pos = ascii_decode(s, end, writer.data);
4849 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850 while (s < end) {
4851 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004852 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004853
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004854 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004855 if (PyUnicode_IS_ASCII(writer.buffer))
4856 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004857 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004858 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004860 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004861 } else {
4862 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004863 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004864 }
4865
4866 switch (ch) {
4867 case 0:
4868 if (s == end || consumed)
4869 goto End;
4870 errmsg = "unexpected end of data";
4871 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004872 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004873 break;
4874 case 1:
4875 errmsg = "invalid start byte";
4876 startinpos = s - starts;
4877 endinpos = startinpos + 1;
4878 break;
4879 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004880 case 3:
4881 case 4:
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004882 if (s == end || consumed) {
4883 goto End;
4884 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885 errmsg = "invalid continuation byte";
4886 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004887 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 break;
4889 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004890 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 goto onError;
4892 continue;
4893 }
4894
Victor Stinner1d65d912015-10-05 13:43:50 +02004895 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004896 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004897
4898 switch (error_handler) {
4899 case _Py_ERROR_IGNORE:
4900 s += (endinpos - startinpos);
4901 break;
4902
4903 case _Py_ERROR_REPLACE:
4904 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4905 goto onError;
4906 s += (endinpos - startinpos);
4907 break;
4908
4909 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004910 {
4911 Py_ssize_t i;
4912
Victor Stinner1d65d912015-10-05 13:43:50 +02004913 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4914 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004915 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004916 ch = (Py_UCS4)(unsigned char)(starts[i]);
4917 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4918 ch + 0xdc00);
4919 writer.pos++;
4920 }
4921 s += (endinpos - startinpos);
4922 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004923 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004924
4925 default:
4926 if (unicode_decode_call_errorhandler_writer(
4927 errors, &error_handler_obj,
4928 "utf-8", errmsg,
4929 &starts, &end, &startinpos, &endinpos, &exc, &s,
4930 &writer))
4931 goto onError;
4932 }
Victor Stinner785938e2011-12-11 20:09:03 +01004933 }
4934
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 if (consumed)
4937 *consumed = s - starts;
4938
Victor Stinner1d65d912015-10-05 13:43:50 +02004939 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004941 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004942
4943onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004944 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004946 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004948}
4949
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004950
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004951/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4952 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004953
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004954 On success, write a pointer to a newly allocated wide character string into
4955 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4956 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004957
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004958 On memory allocation failure, return -1.
4959
4960 On decoding error (if surrogateescape is zero), return -2. If wlen is
4961 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4962 is not NULL, write the decoding error message into *reason. */
4963int
4964_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004965 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004966{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004967 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004968 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004969 wchar_t *unicode;
4970 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004971
Victor Stinner3d4226a2018-08-29 22:21:32 +02004972 int surrogateescape = 0;
4973 int surrogatepass = 0;
4974 switch (errors)
4975 {
4976 case _Py_ERROR_STRICT:
4977 break;
4978 case _Py_ERROR_SURROGATEESCAPE:
4979 surrogateescape = 1;
4980 break;
4981 case _Py_ERROR_SURROGATEPASS:
4982 surrogatepass = 1;
4983 break;
4984 default:
4985 return -3;
4986 }
4987
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004988 /* Note: size will always be longer than the resulting Unicode
4989 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004990 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004991 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004992 }
4993
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004994 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004995 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004996 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004997 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998
4999 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005000 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005002 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005006#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005008#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 if (ch > 0xFF) {
5010#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005011 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005013 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005014 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5016 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5017#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005018 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005019 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005020 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005022 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005023
5024 if (surrogateescape) {
5025 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5026 }
5027 else {
5028 /* Is it a valid three-byte code? */
5029 if (surrogatepass
5030 && (e - s) >= 3
5031 && (s[0] & 0xf0) == 0xe0
5032 && (s[1] & 0xc0) == 0x80
5033 && (s[2] & 0xc0) == 0x80)
5034 {
5035 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5036 s += 3;
5037 unicode[outpos++] = ch;
5038 }
5039 else {
5040 PyMem_RawFree(unicode );
5041 if (reason != NULL) {
5042 switch (ch) {
5043 case 0:
5044 *reason = "unexpected end of data";
5045 break;
5046 case 1:
5047 *reason = "invalid start byte";
5048 break;
5049 /* 2, 3, 4 */
5050 default:
5051 *reason = "invalid continuation byte";
5052 break;
5053 }
5054 }
5055 if (wlen != NULL) {
5056 *wlen = s - orig_s;
5057 }
5058 return -2;
5059 }
5060 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005061 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005062 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005063 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005064 if (wlen) {
5065 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005066 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005067 *wstr = unicode;
5068 return 0;
5069}
5070
Victor Stinner5f9cf232019-03-19 01:46:25 +01005071
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005072wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005073_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5074 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005075{
5076 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005077 int res = _Py_DecodeUTF8Ex(arg, arglen,
5078 &wstr, wlen,
5079 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005080 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005081 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5082 assert(res != -3);
5083 if (wlen) {
5084 *wlen = (size_t)res;
5085 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005086 return NULL;
5087 }
5088 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005089}
5090
Antoine Pitrouab868312009-01-10 15:40:25 +00005091
Victor Stinnere47e6982017-12-21 15:45:16 +01005092/* UTF-8 encoder using the surrogateescape error handler .
5093
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005094 On success, return 0 and write the newly allocated character string (use
5095 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005096
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005097 On encoding failure, return -2 and write the position of the invalid
5098 surrogate character into *error_pos (if error_pos is set) and the decoding
5099 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005100
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005101 On memory allocation failure, return -1. */
5102int
5103_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005104 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005105{
5106 const Py_ssize_t max_char_size = 4;
5107 Py_ssize_t len = wcslen(text);
5108
5109 assert(len >= 0);
5110
Victor Stinner3d4226a2018-08-29 22:21:32 +02005111 int surrogateescape = 0;
5112 int surrogatepass = 0;
5113 switch (errors)
5114 {
5115 case _Py_ERROR_STRICT:
5116 break;
5117 case _Py_ERROR_SURROGATEESCAPE:
5118 surrogateescape = 1;
5119 break;
5120 case _Py_ERROR_SURROGATEPASS:
5121 surrogatepass = 1;
5122 break;
5123 default:
5124 return -3;
5125 }
5126
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005127 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5128 return -1;
5129 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005130 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005131 if (raw_malloc) {
5132 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005133 }
5134 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005135 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005136 }
5137 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005138 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005139 }
5140
5141 char *p = bytes;
5142 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005143 for (i = 0; i < len; ) {
5144 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005145 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005146 i++;
5147#if Py_UNICODE_SIZE == 2
5148 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5149 && i < len
5150 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5151 {
5152 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5153 i++;
5154 }
5155#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005156
5157 if (ch < 0x80) {
5158 /* Encode ASCII */
5159 *p++ = (char) ch;
5160
5161 }
5162 else if (ch < 0x0800) {
5163 /* Encode Latin-1 */
5164 *p++ = (char)(0xc0 | (ch >> 6));
5165 *p++ = (char)(0x80 | (ch & 0x3f));
5166 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005167 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005168 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005169 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005170 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005171 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005172 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005173 if (reason != NULL) {
5174 *reason = "encoding error";
5175 }
5176 if (raw_malloc) {
5177 PyMem_RawFree(bytes);
5178 }
5179 else {
5180 PyMem_Free(bytes);
5181 }
5182 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005183 }
5184 *p++ = (char)(ch & 0xff);
5185 }
5186 else if (ch < 0x10000) {
5187 *p++ = (char)(0xe0 | (ch >> 12));
5188 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5189 *p++ = (char)(0x80 | (ch & 0x3f));
5190 }
5191 else { /* ch >= 0x10000 */
5192 assert(ch <= MAX_UNICODE);
5193 /* Encode UCS4 Unicode ordinals */
5194 *p++ = (char)(0xf0 | (ch >> 18));
5195 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5196 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5197 *p++ = (char)(0x80 | (ch & 0x3f));
5198 }
5199 }
5200 *p++ = '\0';
5201
5202 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005203 char *bytes2;
5204 if (raw_malloc) {
5205 bytes2 = PyMem_RawRealloc(bytes, final_size);
5206 }
5207 else {
5208 bytes2 = PyMem_Realloc(bytes, final_size);
5209 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005210 if (bytes2 == NULL) {
5211 if (error_pos != NULL) {
5212 *error_pos = (size_t)-1;
5213 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005214 if (raw_malloc) {
5215 PyMem_RawFree(bytes);
5216 }
5217 else {
5218 PyMem_Free(bytes);
5219 }
5220 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005221 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005222 *str = bytes2;
5223 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005224}
5225
5226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005227/* Primary internal function which creates utf8 encoded bytes objects.
5228
5229 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005230 and allocate exactly as much space needed at the end. Else allocate the
5231 maximum possible needed (4 result bytes per Unicode character), and return
5232 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005233*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005234PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005235_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236{
Victor Stinner6099a032011-12-18 14:22:26 +01005237 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005238 void *data;
5239 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005241 if (!PyUnicode_Check(unicode)) {
5242 PyErr_BadArgument();
5243 return NULL;
5244 }
5245
5246 if (PyUnicode_READY(unicode) == -1)
5247 return NULL;
5248
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005249 if (PyUnicode_UTF8(unicode))
5250 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5251 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005252
5253 kind = PyUnicode_KIND(unicode);
5254 data = PyUnicode_DATA(unicode);
5255 size = PyUnicode_GET_LENGTH(unicode);
5256
Benjamin Petersonead6b532011-12-20 17:23:42 -06005257 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005258 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005259 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005260 case PyUnicode_1BYTE_KIND:
5261 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5262 assert(!PyUnicode_IS_ASCII(unicode));
5263 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5264 case PyUnicode_2BYTE_KIND:
5265 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5266 case PyUnicode_4BYTE_KIND:
5267 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269}
5270
Alexander Belopolsky40018472011-02-26 01:02:56 +00005271PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005272PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5273 Py_ssize_t size,
5274 const char *errors)
5275{
5276 PyObject *v, *unicode;
5277
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005278 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005279 if (unicode == NULL)
5280 return NULL;
5281 v = _PyUnicode_AsUTF8String(unicode, errors);
5282 Py_DECREF(unicode);
5283 return v;
5284}
5285
5286PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005287PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005289 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290}
5291
Walter Dörwald41980ca2007-08-16 21:55:45 +00005292/* --- UTF-32 Codec ------------------------------------------------------- */
5293
5294PyObject *
5295PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 Py_ssize_t size,
5297 const char *errors,
5298 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005299{
5300 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5301}
5302
/* Decode 'size' bytes of UTF-32 starting at 's'.

   errors:    error handler name, passed on to
              unicode_decode_call_errorhandler_writer().
   byteorder: if non-NULL, *byteorder gives the byte order to use:
              negative is little endian, positive is big endian, and 0
              means "look for a BOM, otherwise use the native order".  When
              *byteorder is 0 and the input has at least 4 bytes, the byte
              order that was settled on (-1, 1, or still 0 if there was no
              BOM) is written back to *byteorder.
   consumed:  if non-NULL, trailing bytes that do not form a complete 4-byte
              unit are not an error; decoding stops in front of them and the
              number of bytes decoded is stored in *consumed.

   Returns the decoded string object, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* q is the read cursor, e is one past the last input byte. */
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* Read the first unit as little endian: a little-endian BOM then
           reads as 0x0000FEFF and a big-endian one as 0xFFFE0000. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode: empty input, or a BOM only. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means "native order". */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    _PyUnicodeWriter_Init(&writer);
    /* One output character per (possibly partial) 4-byte unit. */
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast loop: store units straight into the writer's buffer for
               as long as they fit its current maximum character and are not
               surrogates.  When the loop breaks out early, ch holds the
               offending value and q still points at its unit; when it runs
               to completion, ch is the last value stored. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    /* NOTE(review): the surrogate test is skipped for the
                       1-byte kind, presumably because maxch is then below
                       the surrogate range and the check above already
                       caught it -- confirm. */
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                /* Same loop, reading the units as big endian. */
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Work out why the fast loop stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* The loop ran out of complete units (or there were none).
               Done if the input is exhausted, or if the caller accepts a
               partial trailing unit. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch does not fit the current buffer: if it is a valid code
               point, append it through the writer and go back to the fast
               loop. */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5447
5448PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005449_PyUnicode_EncodeUTF32(PyObject *str,
5450 const char *errors,
5451 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005452{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005453 enum PyUnicode_Kind kind;
5454 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005455 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005456 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005457 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005458#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005459 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005460#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005462#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005463 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005464 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005465 PyObject *errorHandler = NULL;
5466 PyObject *exc = NULL;
5467 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005468
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005469 if (!PyUnicode_Check(str)) {
5470 PyErr_BadArgument();
5471 return NULL;
5472 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005473 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005474 return NULL;
5475 kind = PyUnicode_KIND(str);
5476 data = PyUnicode_DATA(str);
5477 len = PyUnicode_GET_LENGTH(str);
5478
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005479 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005480 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005481 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005482 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005483 if (v == NULL)
5484 return NULL;
5485
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005486 /* output buffer is 4-bytes aligned */
5487 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005488 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005489 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005490 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005491 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005492 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005493
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005494 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005496 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 else
5499 encoding = "utf-32";
5500
5501 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005502 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5503 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005504 }
5505
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005506 pos = 0;
5507 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005508 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005509
5510 if (kind == PyUnicode_2BYTE_KIND) {
5511 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5512 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005513 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005514 else {
5515 assert(kind == PyUnicode_4BYTE_KIND);
5516 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5517 &out, native_ordering);
5518 }
5519 if (pos == len)
5520 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005521
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005522 rep = unicode_encode_call_errorhandler(
5523 errors, &errorHandler,
5524 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005525 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005526 if (!rep)
5527 goto error;
5528
5529 if (PyBytes_Check(rep)) {
5530 repsize = PyBytes_GET_SIZE(rep);
5531 if (repsize & 3) {
5532 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005533 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005534 "surrogates not allowed");
5535 goto error;
5536 }
5537 moreunits = repsize / 4;
5538 }
5539 else {
5540 assert(PyUnicode_Check(rep));
5541 if (PyUnicode_READY(rep) < 0)
5542 goto error;
5543 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5544 if (!PyUnicode_IS_ASCII(rep)) {
5545 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005546 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005547 "surrogates not allowed");
5548 goto error;
5549 }
5550 }
5551
5552 /* four bytes are reserved for each surrogate */
5553 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005554 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005555 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005556 /* integer overflow */
5557 PyErr_NoMemory();
5558 goto error;
5559 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005560 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005561 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005562 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005563 }
5564
5565 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005566 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005567 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005568 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005569 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005570 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5571 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005572 }
5573
5574 Py_CLEAR(rep);
5575 }
5576
5577 /* Cut back to size actually needed. This is necessary for, for example,
5578 encoding of a string containing isolated surrogates and the 'ignore'
5579 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005580 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005581 if (nsize != PyBytes_GET_SIZE(v))
5582 _PyBytes_Resize(&v, nsize);
5583 Py_XDECREF(errorHandler);
5584 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005585 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005586 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005587 error:
5588 Py_XDECREF(rep);
5589 Py_XDECREF(errorHandler);
5590 Py_XDECREF(exc);
5591 Py_XDECREF(v);
5592 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005593}
5594
Alexander Belopolsky40018472011-02-26 01:02:56 +00005595PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005596PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5597 Py_ssize_t size,
5598 const char *errors,
5599 int byteorder)
5600{
5601 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005602 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005603 if (tmp == NULL)
5604 return NULL;
5605 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5606 Py_DECREF(tmp);
5607 return result;
5608}
5609
5610PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005611PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005612{
Victor Stinnerb960b342011-11-20 19:12:52 +01005613 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005614}
5615
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616/* --- UTF-16 Codec ------------------------------------------------------- */
5617
Tim Peters772747b2001-08-09 22:21:55 +00005618PyObject *
5619PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 Py_ssize_t size,
5621 const char *errors,
5622 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623{
Walter Dörwald69652032004-09-07 20:24:22 +00005624 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5625}
5626
5627PyObject *
5628PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 Py_ssize_t size,
5630 const char *errors,
5631 int *byteorder,
5632 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 Py_ssize_t startinpos;
5636 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005639 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005640 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005641 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642 PyObject *errorHandler = NULL;
5643 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005644 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645
Tim Peters772747b2001-08-09 22:21:55 +00005646 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005647 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
5649 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005650 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005652 /* Check for BOM marks (U+FEFF) in the input and adjust current
5653 byte order setting accordingly. In native mode, the leading BOM
5654 mark is skipped, in all other modes, it is copied to the output
5655 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005656 if (bo == 0 && size >= 2) {
5657 const Py_UCS4 bom = (q[1] << 8) | q[0];
5658 if (bom == 0xFEFF) {
5659 q += 2;
5660 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005662 else if (bom == 0xFFFE) {
5663 q += 2;
5664 bo = 1;
5665 }
5666 if (byteorder)
5667 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669
Antoine Pitrou63065d72012-05-15 23:48:04 +02005670 if (q == e) {
5671 if (consumed)
5672 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005673 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005674 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005675
Christian Heimes743e0cd2012-10-17 23:52:17 +02005676#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005677 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005678 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005679#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005680 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005681 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005682#endif
Tim Peters772747b2001-08-09 22:21:55 +00005683
Antoine Pitrou63065d72012-05-15 23:48:04 +02005684 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005685 character count normally. Error handler will take care of
5686 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005687 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005688 writer.min_length = (e - q + 1) / 2;
5689 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005690 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005691
Antoine Pitrou63065d72012-05-15 23:48:04 +02005692 while (1) {
5693 Py_UCS4 ch = 0;
5694 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005695 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005696 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005697 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005698 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005700 native_ordering);
5701 else
5702 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005703 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005704 native_ordering);
5705 } else if (kind == PyUnicode_2BYTE_KIND) {
5706 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005708 native_ordering);
5709 } else {
5710 assert(kind == PyUnicode_4BYTE_KIND);
5711 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005712 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005713 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005714 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005715 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716
Antoine Pitrou63065d72012-05-15 23:48:04 +02005717 switch (ch)
5718 {
5719 case 0:
5720 /* remaining byte at the end? (size should be even) */
5721 if (q == e || consumed)
5722 goto End;
5723 errmsg = "truncated data";
5724 startinpos = ((const char *)q) - starts;
5725 endinpos = ((const char *)e) - starts;
5726 break;
5727 /* The remaining input chars are ignored if the callback
5728 chooses to skip the input */
5729 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005730 q -= 2;
5731 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005732 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005733 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005734 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005735 endinpos = ((const char *)e) - starts;
5736 break;
5737 case 2:
5738 errmsg = "illegal encoding";
5739 startinpos = ((const char *)q) - 2 - starts;
5740 endinpos = startinpos + 2;
5741 break;
5742 case 3:
5743 errmsg = "illegal UTF-16 surrogate";
5744 startinpos = ((const char *)q) - 4 - starts;
5745 endinpos = startinpos + 2;
5746 break;
5747 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005748 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005749 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 continue;
5751 }
5752
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005753 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005754 errors,
5755 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005756 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005757 &starts,
5758 (const char **)&e,
5759 &startinpos,
5760 &endinpos,
5761 &exc,
5762 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005763 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 }
5766
Antoine Pitrou63065d72012-05-15 23:48:04 +02005767End:
Walter Dörwald69652032004-09-07 20:24:22 +00005768 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005770
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 Py_XDECREF(errorHandler);
5772 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005773 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005776 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 Py_XDECREF(errorHandler);
5778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 return NULL;
5780}
5781
Tim Peters772747b2001-08-09 22:21:55 +00005782PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005783_PyUnicode_EncodeUTF16(PyObject *str,
5784 const char *errors,
5785 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005787 enum PyUnicode_Kind kind;
5788 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005789 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005790 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005791 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005792 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005793#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005794 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005795#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005796 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005797#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005798 const char *encoding;
5799 Py_ssize_t nsize, pos;
5800 PyObject *errorHandler = NULL;
5801 PyObject *exc = NULL;
5802 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005803
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005804 if (!PyUnicode_Check(str)) {
5805 PyErr_BadArgument();
5806 return NULL;
5807 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005808 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005809 return NULL;
5810 kind = PyUnicode_KIND(str);
5811 data = PyUnicode_DATA(str);
5812 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005813
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005814 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005815 if (kind == PyUnicode_4BYTE_KIND) {
5816 const Py_UCS4 *in = (const Py_UCS4 *)data;
5817 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005818 while (in < end) {
5819 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005820 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005821 }
5822 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005823 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005824 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005826 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005827 nsize = len + pairs + (byteorder == 0);
5828 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005829 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005833 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005834 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005835 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005836 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005837 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005838 }
5839 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005840 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005841 }
Tim Peters772747b2001-08-09 22:21:55 +00005842
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005843 if (kind == PyUnicode_1BYTE_KIND) {
5844 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5845 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005846 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005847
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005848 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005849 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005850 }
5851 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005852 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005853 }
5854 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005855 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005856 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005857
5858 pos = 0;
5859 while (pos < len) {
5860 Py_ssize_t repsize, moreunits;
5861
5862 if (kind == PyUnicode_2BYTE_KIND) {
5863 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5864 &out, native_ordering);
5865 }
5866 else {
5867 assert(kind == PyUnicode_4BYTE_KIND);
5868 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5869 &out, native_ordering);
5870 }
5871 if (pos == len)
5872 break;
5873
5874 rep = unicode_encode_call_errorhandler(
5875 errors, &errorHandler,
5876 encoding, "surrogates not allowed",
5877 str, &exc, pos, pos + 1, &pos);
5878 if (!rep)
5879 goto error;
5880
5881 if (PyBytes_Check(rep)) {
5882 repsize = PyBytes_GET_SIZE(rep);
5883 if (repsize & 1) {
5884 raise_encode_exception(&exc, encoding,
5885 str, pos - 1, pos,
5886 "surrogates not allowed");
5887 goto error;
5888 }
5889 moreunits = repsize / 2;
5890 }
5891 else {
5892 assert(PyUnicode_Check(rep));
5893 if (PyUnicode_READY(rep) < 0)
5894 goto error;
5895 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5896 if (!PyUnicode_IS_ASCII(rep)) {
5897 raise_encode_exception(&exc, encoding,
5898 str, pos - 1, pos,
5899 "surrogates not allowed");
5900 goto error;
5901 }
5902 }
5903
5904 /* two bytes are reserved for each surrogate */
5905 if (moreunits > 1) {
5906 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005907 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005908 /* integer overflow */
5909 PyErr_NoMemory();
5910 goto error;
5911 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005912 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005913 goto error;
5914 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5915 }
5916
5917 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005918 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005919 out += moreunits;
5920 } else /* rep is unicode */ {
5921 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5922 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5923 &out, native_ordering);
5924 }
5925
5926 Py_CLEAR(rep);
5927 }
5928
5929 /* Cut back to size actually needed. This is necessary for, for example,
5930 encoding of a string containing isolated surrogates and the 'ignore' handler
5931 is used. */
5932 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5933 if (nsize != PyBytes_GET_SIZE(v))
5934 _PyBytes_Resize(&v, nsize);
5935 Py_XDECREF(errorHandler);
5936 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005937 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005938 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005939 error:
5940 Py_XDECREF(rep);
5941 Py_XDECREF(errorHandler);
5942 Py_XDECREF(exc);
5943 Py_XDECREF(v);
5944 return NULL;
5945#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946}
5947
Alexander Belopolsky40018472011-02-26 01:02:56 +00005948PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5950 Py_ssize_t size,
5951 const char *errors,
5952 int byteorder)
5953{
5954 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005955 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 if (tmp == NULL)
5957 return NULL;
5958 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5959 Py_DECREF(tmp);
5960 return result;
5961}
5962
5963PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005964PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005966 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967}
5968
5969/* --- Unicode Escape Codec ----------------------------------------------- */
5970
/* Cached pointer to the character-name lookup C API.  It starts out NULL;
   _PyUnicode_DecodeUnicodeEscape() fills it in via
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1) the first time it meets a
   \N{...} escape while the pointer is still NULL. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005972
Alexander Belopolsky40018472011-02-26 01:02:56 +00005973PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005974_PyUnicode_DecodeUnicodeEscape(const char *s,
5975 Py_ssize_t size,
5976 const char *errors,
5977 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005979 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005980 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005982 PyObject *errorHandler = NULL;
5983 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005984
Eric V. Smith42454af2016-10-31 09:22:08 -04005985 // so we can remember if we've seen an invalid escape char or not
5986 *first_invalid_escape = NULL;
5987
Victor Stinner62ec3312016-09-06 17:04:34 -07005988 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005989 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005990 }
5991 /* Escaped strings will always be longer than the resulting
5992 Unicode string, so we start with size here and then reduce the
5993 length after conversion to the true value.
5994 (but if the error callback returns a long replacement string
5995 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005996 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005997 writer.min_length = size;
5998 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5999 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006000 }
6001
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 end = s + size;
6003 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006004 unsigned char c = (unsigned char) *s++;
6005 Py_UCS4 ch;
6006 int count;
6007 Py_ssize_t startinpos;
6008 Py_ssize_t endinpos;
6009 const char *message;
6010
6011#define WRITE_ASCII_CHAR(ch) \
6012 do { \
6013 assert(ch <= 127); \
6014 assert(writer.pos < writer.size); \
6015 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6016 } while(0)
6017
6018#define WRITE_CHAR(ch) \
6019 do { \
6020 if (ch <= writer.maxchar) { \
6021 assert(writer.pos < writer.size); \
6022 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6023 } \
6024 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6025 goto onError; \
6026 } \
6027 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
6029 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006030 if (c != '\\') {
6031 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 continue;
6033 }
6034
Victor Stinner62ec3312016-09-06 17:04:34 -07006035 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006037 if (s >= end) {
6038 message = "\\ at end of string";
6039 goto error;
6040 }
6041 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006042
Victor Stinner62ec3312016-09-06 17:04:34 -07006043 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006044 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006047 case '\n': continue;
6048 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6049 case '\'': WRITE_ASCII_CHAR('\''); continue;
6050 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6051 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006052 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006053 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6054 case 't': WRITE_ASCII_CHAR('\t'); continue;
6055 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6056 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006057 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006058 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006059 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006060 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 case '0': case '1': case '2': case '3':
6064 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006066 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006067 ch = (ch<<3) + *s++ - '0';
6068 if (s < end && '0' <= *s && *s <= '7') {
6069 ch = (ch<<3) + *s++ - '0';
6070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006072 WRITE_CHAR(ch);
6073 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 /* hex escapes */
6076 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006078 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006079 message = "truncated \\xXX escape";
6080 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006084 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006085 message = "truncated \\uXXXX escape";
6086 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006089 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006090 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006091 message = "truncated \\UXXXXXXXX escape";
6092 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006093 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006094 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006095 ch <<= 4;
6096 if (c >= '0' && c <= '9') {
6097 ch += c - '0';
6098 }
6099 else if (c >= 'a' && c <= 'f') {
6100 ch += c - ('a' - 10);
6101 }
6102 else if (c >= 'A' && c <= 'F') {
6103 ch += c - ('A' - 10);
6104 }
6105 else {
6106 break;
6107 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006108 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006109 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006110 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006111 }
6112
6113 /* when we get here, ch is a 32-bit unicode character */
6114 if (ch > MAX_UNICODE) {
6115 message = "illegal Unicode character";
6116 goto error;
6117 }
6118
6119 WRITE_CHAR(ch);
6120 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006121
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006123 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006124 if (ucnhash_CAPI == NULL) {
6125 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006126 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6127 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006128 if (ucnhash_CAPI == NULL) {
6129 PyErr_SetString(
6130 PyExc_UnicodeError,
6131 "\\N escapes not supported (can't load unicodedata module)"
6132 );
6133 goto onError;
6134 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006135 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006136
6137 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006138 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006139 const char *start = ++s;
6140 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006141 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006143 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006144 namelen = s - start;
6145 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006146 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006147 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006148 ch = 0xffffffff; /* in case 'getcode' messes up */
6149 if (namelen <= INT_MAX &&
6150 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6151 &ch, 0)) {
6152 assert(ch <= MAX_UNICODE);
6153 WRITE_CHAR(ch);
6154 continue;
6155 }
6156 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006157 }
6158 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006159 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006160
6161 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006162 if (*first_invalid_escape == NULL) {
6163 *first_invalid_escape = s-1; /* Back up one char, since we've
6164 already incremented s. */
6165 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006166 WRITE_ASCII_CHAR('\\');
6167 WRITE_CHAR(c);
6168 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006170
6171 error:
6172 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006174 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006175 errors, &errorHandler,
6176 "unicodeescape", message,
6177 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006178 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006179 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006180 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006181 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006182
6183#undef WRITE_ASCII_CHAR
6184#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006186
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006187 Py_XDECREF(errorHandler);
6188 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006189 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006190
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006192 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006193 Py_XDECREF(errorHandler);
6194 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 return NULL;
6196}
6197
Eric V. Smith42454af2016-10-31 09:22:08 -04006198PyObject *
6199PyUnicode_DecodeUnicodeEscape(const char *s,
6200 Py_ssize_t size,
6201 const char *errors)
6202{
6203 const char *first_invalid_escape;
6204 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6205 &first_invalid_escape);
6206 if (result == NULL)
6207 return NULL;
6208 if (first_invalid_escape != NULL) {
6209 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6210 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006211 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006212 Py_DECREF(result);
6213 return NULL;
6214 }
6215 }
6216 return result;
6217}
6218
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006219/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220
Alexander Belopolsky40018472011-02-26 01:02:56 +00006221PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006222PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006225 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006227 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006228 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006229 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230
Ezio Melottie7f90372012-10-05 03:33:31 +03006231 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006232 escape.
6233
Ezio Melottie7f90372012-10-05 03:33:31 +03006234 For UCS1 strings it's '\xxx', 4 bytes per source character.
6235 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6236 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006237 */
6238
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006239 if (!PyUnicode_Check(unicode)) {
6240 PyErr_BadArgument();
6241 return NULL;
6242 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006243 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006244 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 }
Victor Stinner358af132015-10-12 22:36:57 +02006246
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006247 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 if (len == 0) {
6249 return PyBytes_FromStringAndSize(NULL, 0);
6250 }
6251
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006252 kind = PyUnicode_KIND(unicode);
6253 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006254 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6255 bytes, and 1 byte characters 4. */
6256 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006257 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 return PyErr_NoMemory();
6259 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006260 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 if (repr == NULL) {
6262 return NULL;
6263 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006264
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006266 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006267 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006268
Victor Stinner62ec3312016-09-06 17:04:34 -07006269 /* U+0000-U+00ff range */
6270 if (ch < 0x100) {
6271 if (ch >= ' ' && ch < 127) {
6272 if (ch != '\\') {
6273 /* Copy printable US ASCII as-is */
6274 *p++ = (char) ch;
6275 }
6276 /* Escape backslashes */
6277 else {
6278 *p++ = '\\';
6279 *p++ = '\\';
6280 }
6281 }
Victor Stinner358af132015-10-12 22:36:57 +02006282
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 /* Map special whitespace to '\t', \n', '\r' */
6284 else if (ch == '\t') {
6285 *p++ = '\\';
6286 *p++ = 't';
6287 }
6288 else if (ch == '\n') {
6289 *p++ = '\\';
6290 *p++ = 'n';
6291 }
6292 else if (ch == '\r') {
6293 *p++ = '\\';
6294 *p++ = 'r';
6295 }
6296
6297 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6298 else {
6299 *p++ = '\\';
6300 *p++ = 'x';
6301 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6302 *p++ = Py_hexdigits[ch & 0x000F];
6303 }
Tim Petersced69f82003-09-16 20:30:58 +00006304 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006305 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006306 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 *p++ = '\\';
6308 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006309 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6310 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6311 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6312 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6315 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006316
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 /* Make sure that the first two digits are zero */
6318 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006319 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 *p++ = 'U';
6321 *p++ = '0';
6322 *p++ = '0';
6323 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6324 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6325 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6326 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6327 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6328 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 assert(p - PyBytes_AS_STRING(repr) > 0);
6333 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6334 return NULL;
6335 }
6336 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337}
6338
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006340PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6341 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006343 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006344 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 }
6348
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006349 result = PyUnicode_AsUnicodeEscapeString(tmp);
6350 Py_DECREF(tmp);
6351 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352}
6353
6354/* --- Raw Unicode Escape Codec ------------------------------------------- */
6355
Alexander Belopolsky40018472011-02-26 01:02:56 +00006356PyObject *
6357PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006358 Py_ssize_t size,
6359 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006362 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364 PyObject *errorHandler = NULL;
6365 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006366
Victor Stinner62ec3312016-09-06 17:04:34 -07006367 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006368 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006369 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006370
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 /* Escaped strings will always be longer than the resulting
6372 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006373 length after conversion to the true value. (But decoding error
6374 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006375 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006376 writer.min_length = size;
6377 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6378 goto onError;
6379 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 end = s + size;
6382 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 unsigned char c = (unsigned char) *s++;
6384 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006385 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006386 Py_ssize_t startinpos;
6387 Py_ssize_t endinpos;
6388 const char *message;
6389
6390#define WRITE_CHAR(ch) \
6391 do { \
6392 if (ch <= writer.maxchar) { \
6393 assert(writer.pos < writer.size); \
6394 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6395 } \
6396 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6397 goto onError; \
6398 } \
6399 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006402 if (c != '\\' || s >= end) {
6403 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006405 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006406
Victor Stinner62ec3312016-09-06 17:04:34 -07006407 c = (unsigned char) *s++;
6408 if (c == 'u') {
6409 count = 4;
6410 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 else if (c == 'U') {
6413 count = 8;
6414 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006415 }
6416 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 assert(writer.pos < writer.size);
6418 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6419 WRITE_CHAR(c);
6420 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006421 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006422 startinpos = s - starts - 2;
6423
6424 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6425 for (ch = 0; count && s < end; ++s, --count) {
6426 c = (unsigned char)*s;
6427 ch <<= 4;
6428 if (c >= '0' && c <= '9') {
6429 ch += c - '0';
6430 }
6431 else if (c >= 'a' && c <= 'f') {
6432 ch += c - ('a' - 10);
6433 }
6434 else if (c >= 'A' && c <= 'F') {
6435 ch += c - ('A' - 10);
6436 }
6437 else {
6438 break;
6439 }
6440 }
6441 if (!count) {
6442 if (ch <= MAX_UNICODE) {
6443 WRITE_CHAR(ch);
6444 continue;
6445 }
6446 message = "\\Uxxxxxxxx out of range";
6447 }
6448
6449 endinpos = s-starts;
6450 writer.min_length = end - s + writer.pos;
6451 if (unicode_decode_call_errorhandler_writer(
6452 errors, &errorHandler,
6453 "rawunicodeescape", message,
6454 &starts, &end, &startinpos, &endinpos, &exc, &s,
6455 &writer)) {
6456 goto onError;
6457 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006458 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006459
6460#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462 Py_XDECREF(errorHandler);
6463 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006464 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006465
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006467 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468 Py_XDECREF(errorHandler);
6469 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006471
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472}
6473
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006474
Alexander Belopolsky40018472011-02-26 01:02:56 +00006475PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006476PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477{
Victor Stinner62ec3312016-09-06 17:04:34 -07006478 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006480 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006481 int kind;
6482 void *data;
6483 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006485 if (!PyUnicode_Check(unicode)) {
6486 PyErr_BadArgument();
6487 return NULL;
6488 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006489 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006490 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006491 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 kind = PyUnicode_KIND(unicode);
6493 data = PyUnicode_DATA(unicode);
6494 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 if (kind == PyUnicode_1BYTE_KIND) {
6496 return PyBytes_FromStringAndSize(data, len);
6497 }
Victor Stinner0e368262011-11-10 20:12:49 +01006498
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6500 bytes, and 1 byte characters 4. */
6501 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006502
Victor Stinner62ec3312016-09-06 17:04:34 -07006503 if (len > PY_SSIZE_T_MAX / expandsize) {
6504 return PyErr_NoMemory();
6505 }
6506 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6507 if (repr == NULL) {
6508 return NULL;
6509 }
6510 if (len == 0) {
6511 return repr;
6512 }
6513
6514 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006515 for (pos = 0; pos < len; pos++) {
6516 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006517
Victor Stinner62ec3312016-09-06 17:04:34 -07006518 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6519 if (ch < 0x100) {
6520 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006521 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006522 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006523 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 *p++ = '\\';
6525 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006526 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6527 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6528 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6529 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006531 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6532 else {
6533 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6534 *p++ = '\\';
6535 *p++ = 'U';
6536 *p++ = '0';
6537 *p++ = '0';
6538 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6539 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6540 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6541 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6542 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6543 *p++ = Py_hexdigits[ch & 15];
6544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006546
Victor Stinner62ec3312016-09-06 17:04:34 -07006547 assert(p > PyBytes_AS_STRING(repr));
6548 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6549 return NULL;
6550 }
6551 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552}
6553
Alexander Belopolsky40018472011-02-26 01:02:56 +00006554PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006555PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6556 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006558 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006559 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006560 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006561 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006562 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6563 Py_DECREF(tmp);
6564 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565}
6566
6567/* --- Latin-1 Codec ------------------------------------------------------ */
6568
Alexander Belopolsky40018472011-02-26 01:02:56 +00006569PyObject *
6570PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006571 Py_ssize_t size,
6572 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006575 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576}
6577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006578/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006579static void
6580make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006581 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006582 PyObject *unicode,
6583 Py_ssize_t startpos, Py_ssize_t endpos,
6584 const char *reason)
6585{
6586 if (*exceptionObject == NULL) {
6587 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006588 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006589 encoding, unicode, startpos, endpos, reason);
6590 }
6591 else {
6592 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6593 goto onError;
6594 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6595 goto onError;
6596 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6597 goto onError;
6598 return;
6599 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006600 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006601 }
6602}
6603
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006605static void
6606raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006607 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006608 PyObject *unicode,
6609 Py_ssize_t startpos, Py_ssize_t endpos,
6610 const char *reason)
6611{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006612 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006613 encoding, unicode, startpos, endpos, reason);
6614 if (*exceptionObject != NULL)
6615 PyCodec_StrictErrors(*exceptionObject);
6616}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006617
6618/* error handling callback helper:
6619 build arguments, call the callback and check the arguments,
6620 put the result into newpos and return the replacement string, which
6621 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006622static PyObject *
6623unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006624 PyObject **errorHandler,
6625 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006626 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006627 Py_ssize_t startpos, Py_ssize_t endpos,
6628 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006629{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006630 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006631 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 PyObject *restuple;
6633 PyObject *resunicode;
6634
6635 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006637 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 }
6640
Benjamin Petersonbac79492012-01-14 13:34:47 -05006641 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642 return NULL;
6643 len = PyUnicode_GET_LENGTH(unicode);
6644
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006645 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006650 restuple = PyObject_CallFunctionObjArgs(
6651 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006652 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006655 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 Py_DECREF(restuple);
6657 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006659 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 &resunicode, newpos)) {
6661 Py_DECREF(restuple);
6662 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006663 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006664 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6665 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6666 Py_DECREF(restuple);
6667 return NULL;
6668 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 *newpos = len + *newpos;
6671 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006672 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 Py_DECREF(restuple);
6674 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 Py_INCREF(resunicode);
6677 Py_DECREF(restuple);
6678 return resunicode;
6679}
6680
Alexander Belopolsky40018472011-02-26 01:02:56 +00006681static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006683 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006684 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006686 /* input state */
6687 Py_ssize_t pos=0, size;
6688 int kind;
6689 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 /* pointer into the output */
6691 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006692 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6693 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006694 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006696 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006697 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006698 /* output object */
6699 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700
Benjamin Petersonbac79492012-01-14 13:34:47 -05006701 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 return NULL;
6703 size = PyUnicode_GET_LENGTH(unicode);
6704 kind = PyUnicode_KIND(unicode);
6705 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 /* allocate enough for a simple encoding without
6707 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006708 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006709 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006710
6711 _PyBytesWriter_Init(&writer);
6712 str = _PyBytesWriter_Alloc(&writer, size);
6713 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006714 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006716 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006717 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006720 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006722 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006723 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006724 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006726 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006728 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006729 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006731
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006732 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006734
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006735 /* Only overallocate the buffer if it's not the last write */
6736 writer.overallocate = (collend < size);
6737
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006739 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006740 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006741
6742 switch (error_handler) {
6743 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006744 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006746
6747 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006748 memset(str, '?', collend - collstart);
6749 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006750 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006751 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 break;
Victor Stinner50149202015-09-22 00:26:54 +02006754
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006755 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006756 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006757 writer.min_size -= (collend - collstart);
6758 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006759 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006760 if (str == NULL)
6761 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006762 pos = collend;
6763 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006764
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006765 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006766 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006767 writer.min_size -= (collend - collstart);
6768 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006769 unicode, collstart, collend);
6770 if (str == NULL)
6771 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006772 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 break;
Victor Stinner50149202015-09-22 00:26:54 +02006774
Victor Stinnerc3713e92015-09-29 12:32:13 +02006775 case _Py_ERROR_SURROGATEESCAPE:
6776 for (i = collstart; i < collend; ++i) {
6777 ch = PyUnicode_READ(kind, data, i);
6778 if (ch < 0xdc80 || 0xdcff < ch) {
6779 /* Not a UTF-8b surrogate */
6780 break;
6781 }
6782 *str++ = (char)(ch - 0xdc00);
6783 ++pos;
6784 }
6785 if (i >= collend)
6786 break;
6787 collstart = pos;
6788 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006789 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006790
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006792 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6793 encoding, reason, unicode, &exc,
6794 collstart, collend, &newpos);
6795 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006796 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006797
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006798 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006799 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006800
Victor Stinner6bd525b2015-10-09 13:10:05 +02006801 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006802 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006803 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006804 PyBytes_AS_STRING(rep),
6805 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006806 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006807 else {
6808 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006809
Victor Stinner6bd525b2015-10-09 13:10:05 +02006810 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006812
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006813 if (limit == 256 ?
6814 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6815 !PyUnicode_IS_ASCII(rep))
6816 {
6817 /* Not all characters are smaller than limit */
6818 raise_encode_exception(&exc, encoding, unicode,
6819 collstart, collend, reason);
6820 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006822 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6823 str = _PyBytesWriter_WriteBytes(&writer, str,
6824 PyUnicode_DATA(rep),
6825 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006827 if (str == NULL)
6828 goto onError;
6829
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006830 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006831 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006832 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006833
6834 /* If overallocation was disabled, ensure that it was the last
6835 write. Otherwise, we missed an optimization */
6836 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006837 }
6838 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006839
Victor Stinner50149202015-09-22 00:26:54 +02006840 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006841 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006842 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006843
6844 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006846 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006847 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006848 Py_XDECREF(exc);
6849 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006850}
6851
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006852/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006853PyObject *
6854PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006855 Py_ssize_t size,
6856 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006858 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006859 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006860 if (unicode == NULL)
6861 return NULL;
6862 result = unicode_encode_ucs1(unicode, errors, 256);
6863 Py_DECREF(unicode);
6864 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865}
6866
Alexander Belopolsky40018472011-02-26 01:02:56 +00006867PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006868_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869{
6870 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 PyErr_BadArgument();
6872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006874 if (PyUnicode_READY(unicode) == -1)
6875 return NULL;
6876 /* Fast path: if it is a one-byte string, construct
6877 bytes object directly. */
6878 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6879 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6880 PyUnicode_GET_LENGTH(unicode));
6881 /* Non-Latin-1 characters present. Defer to above function to
6882 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006883 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006884}
6885
6886PyObject*
6887PyUnicode_AsLatin1String(PyObject *unicode)
6888{
6889 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890}
6891
6892/* --- 7-bit ASCII Codec -------------------------------------------------- */
6893
Alexander Belopolsky40018472011-02-26 01:02:56 +00006894PyObject *
6895PyUnicode_DecodeASCII(const char *s,
6896 Py_ssize_t size,
6897 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006900 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006901 int kind;
6902 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006903 Py_ssize_t startinpos;
6904 Py_ssize_t endinpos;
6905 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006906 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006907 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006909 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006910
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006912 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006913
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006915 if (size == 1 && (unsigned char)s[0] < 128)
6916 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006917
Victor Stinner8f674cc2013-04-17 23:02:17 +02006918 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006919 writer.min_length = size;
6920 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006921 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006922
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006924 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006925 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006926 writer.pos = outpos;
6927 if (writer.pos == size)
6928 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006929
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006930 s += writer.pos;
6931 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006932 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006933 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006935 PyUnicode_WRITE(kind, data, writer.pos, c);
6936 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006938 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006940
6941 /* byte outsize range 0x00..0x7f: call the error handler */
6942
6943 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006944 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006945
6946 switch (error_handler)
6947 {
6948 case _Py_ERROR_REPLACE:
6949 case _Py_ERROR_SURROGATEESCAPE:
6950 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006951 but we may switch to UCS2 at the first write */
6952 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6953 goto onError;
6954 kind = writer.kind;
6955 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006956
6957 if (error_handler == _Py_ERROR_REPLACE)
6958 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6959 else
6960 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6961 writer.pos++;
6962 ++s;
6963 break;
6964
6965 case _Py_ERROR_IGNORE:
6966 ++s;
6967 break;
6968
6969 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 startinpos = s-starts;
6971 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006972 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006973 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 "ascii", "ordinal not in range(128)",
6975 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006976 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006978 kind = writer.kind;
6979 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006982 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006983 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006984 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006985
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006987 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006988 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006989 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 return NULL;
6991}
6992
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006993/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006994PyObject *
6995PyUnicode_EncodeASCII(const Py_UNICODE *p,
6996 Py_ssize_t size,
6997 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006999 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007000 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007001 if (unicode == NULL)
7002 return NULL;
7003 result = unicode_encode_ucs1(unicode, errors, 128);
7004 Py_DECREF(unicode);
7005 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006}
7007
Alexander Belopolsky40018472011-02-26 01:02:56 +00007008PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007009_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010{
7011 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 PyErr_BadArgument();
7013 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007015 if (PyUnicode_READY(unicode) == -1)
7016 return NULL;
7017 /* Fast path: if it is an ASCII-only string, construct bytes object
7018 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007019 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007020 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7021 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007022 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007023}
7024
7025PyObject *
7026PyUnicode_AsASCIIString(PyObject *unicode)
7027{
7028 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029}
7030
Steve Dowercc16be82016-09-08 10:35:16 -07007031#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007032
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007033/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007034
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007035#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007036#define NEED_RETRY
7037#endif
7038
Victor Stinner3a50e702011-10-18 21:21:00 +02007039#ifndef WC_ERR_INVALID_CHARS
7040# define WC_ERR_INVALID_CHARS 0x0080
7041#endif
7042
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007043static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007044code_page_name(UINT code_page, PyObject **obj)
7045{
7046 *obj = NULL;
7047 if (code_page == CP_ACP)
7048 return "mbcs";
7049 if (code_page == CP_UTF7)
7050 return "CP_UTF7";
7051 if (code_page == CP_UTF8)
7052 return "CP_UTF8";
7053
7054 *obj = PyBytes_FromFormat("cp%u", code_page);
7055 if (*obj == NULL)
7056 return NULL;
7057 return PyBytes_AS_STRING(*obj);
7058}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007059
Victor Stinner3a50e702011-10-18 21:21:00 +02007060static DWORD
7061decode_code_page_flags(UINT code_page)
7062{
7063 if (code_page == CP_UTF7) {
7064 /* The CP_UTF7 decoder only supports flags=0 */
7065 return 0;
7066 }
7067 else
7068 return MB_ERR_INVALID_CHARS;
7069}
7070
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007071/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 * Decode a byte string from a Windows code page into unicode object in strict
7073 * mode.
7074 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007075 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7076 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007077 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007078static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007079decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007080 wchar_t **buf,
7081 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 const char *in,
7083 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007085 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007086 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088
7089 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007091 while ((outsize = MultiByteToWideChar(code_page, flags,
7092 in, insize, NULL, 0)) <= 0)
7093 {
7094 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7095 goto error;
7096 }
7097 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7098 flags = 0;
7099 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007101 /* Extend a wchar_t* buffer */
7102 Py_ssize_t n = *bufsize; /* Get the current length */
7103 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7104 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007106 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107
7108 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7110 if (outsize <= 0)
7111 goto error;
7112 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007113
Victor Stinner3a50e702011-10-18 21:21:00 +02007114error:
7115 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7116 return -2;
7117 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007118 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119}
7120
Victor Stinner3a50e702011-10-18 21:21:00 +02007121/*
7122 * Decode a byte string from a code page into unicode object with an error
7123 * handler.
7124 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007125 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 * UnicodeDecodeError exception and returns -1 on error.
7127 */
7128static int
7129decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007130 wchar_t **buf,
7131 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007132 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007133 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007134{
7135 const char *startin = in;
7136 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007137 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 /* Ideally, we should get reason from FormatMessage. This is the Windows
7139 2000 English version of the message. */
7140 const char *reason = "No mapping for the Unicode character exists "
7141 "in the target code page.";
7142 /* each step cannot decode more than 1 character, but a character can be
7143 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007144 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007145 int insize;
7146 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 PyObject *errorHandler = NULL;
7148 PyObject *exc = NULL;
7149 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007150 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 DWORD err;
7152 int ret = -1;
7153
7154 assert(size > 0);
7155
7156 encoding = code_page_name(code_page, &encoding_obj);
7157 if (encoding == NULL)
7158 return -1;
7159
Victor Stinner7d00cc12014-03-17 23:08:06 +01007160 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7162 UnicodeDecodeError. */
7163 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7164 if (exc != NULL) {
7165 PyCodec_StrictErrors(exc);
7166 Py_CLEAR(exc);
7167 }
7168 goto error;
7169 }
7170
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007171 /* Extend a wchar_t* buffer */
7172 Py_ssize_t n = *bufsize; /* Get the current length */
7173 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7174 PyErr_NoMemory();
7175 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007177 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7178 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007180 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007181
7182 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 while (in < endin)
7184 {
7185 /* Decode a character */
7186 insize = 1;
7187 do
7188 {
7189 outsize = MultiByteToWideChar(code_page, flags,
7190 in, insize,
7191 buffer, Py_ARRAY_LENGTH(buffer));
7192 if (outsize > 0)
7193 break;
7194 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007195 if (err == ERROR_INVALID_FLAGS && flags) {
7196 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7197 flags = 0;
7198 continue;
7199 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 if (err != ERROR_NO_UNICODE_TRANSLATION
7201 && err != ERROR_INSUFFICIENT_BUFFER)
7202 {
7203 PyErr_SetFromWindowsErr(0);
7204 goto error;
7205 }
7206 insize++;
7207 }
7208 /* 4=maximum length of a UTF-8 sequence */
7209 while (insize <= 4 && (in + insize) <= endin);
7210
7211 if (outsize <= 0) {
7212 Py_ssize_t startinpos, endinpos, outpos;
7213
Victor Stinner7d00cc12014-03-17 23:08:06 +01007214 /* last character in partial decode? */
7215 if (in + insize >= endin && !final)
7216 break;
7217
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 startinpos = in - startin;
7219 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007220 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007221 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 errors, &errorHandler,
7223 encoding, reason,
7224 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007225 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007226 {
7227 goto error;
7228 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007229 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 }
7231 else {
7232 in += insize;
7233 memcpy(out, buffer, outsize * sizeof(wchar_t));
7234 out += outsize;
7235 }
7236 }
7237
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007238 /* Shrink the buffer */
7239 assert(out - *buf <= *bufsize);
7240 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007241 /* (in - startin) <= size and size is an int */
7242 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007243
7244error:
7245 Py_XDECREF(encoding_obj);
7246 Py_XDECREF(errorHandler);
7247 Py_XDECREF(exc);
7248 return ret;
7249}
7250
Victor Stinner3a50e702011-10-18 21:21:00 +02007251static PyObject *
7252decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007253 const char *s, Py_ssize_t size,
7254 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007256 wchar_t *buf = NULL;
7257 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007258 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007259
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 if (code_page < 0) {
7261 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7262 return NULL;
7263 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007264 if (size < 0) {
7265 PyErr_BadInternalCall();
7266 return NULL;
7267 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007268
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007269 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007271
Victor Stinner76a31a62011-11-04 00:05:13 +01007272 do
7273 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007274#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007275 if (size > INT_MAX) {
7276 chunk_size = INT_MAX;
7277 final = 0;
7278 done = 0;
7279 }
7280 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007281#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007282 {
7283 chunk_size = (int)size;
7284 final = (consumed == NULL);
7285 done = 1;
7286 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007289 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007290 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007291 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007292 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007294 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007295 s, chunk_size);
7296 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007297 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007298 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007299 errors, final);
7300 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007301
7302 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007303 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007304 return NULL;
7305 }
7306
7307 if (consumed)
7308 *consumed += converted;
7309
7310 s += converted;
7311 size -= converted;
7312 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007313
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007314 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7315 PyMem_Free(buf);
7316 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317}
7318
Alexander Belopolsky40018472011-02-26 01:02:56 +00007319PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007320PyUnicode_DecodeCodePageStateful(int code_page,
7321 const char *s,
7322 Py_ssize_t size,
7323 const char *errors,
7324 Py_ssize_t *consumed)
7325{
7326 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7327}
7328
7329PyObject *
7330PyUnicode_DecodeMBCSStateful(const char *s,
7331 Py_ssize_t size,
7332 const char *errors,
7333 Py_ssize_t *consumed)
7334{
7335 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7336}
7337
7338PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007339PyUnicode_DecodeMBCS(const char *s,
7340 Py_ssize_t size,
7341 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007342{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7344}
7345
Victor Stinner3a50e702011-10-18 21:21:00 +02007346static DWORD
7347encode_code_page_flags(UINT code_page, const char *errors)
7348{
7349 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007350 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007351 }
7352 else if (code_page == CP_UTF7) {
7353 /* CP_UTF7 only supports flags=0 */
7354 return 0;
7355 }
7356 else {
7357 if (errors != NULL && strcmp(errors, "replace") == 0)
7358 return 0;
7359 else
7360 return WC_NO_BEST_FIT_CHARS;
7361 }
7362}
7363
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 * Encode a Unicode string to a Windows code page into a byte string in strict
7366 * mode.
7367 *
7368 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007369 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007370 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007371static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007372encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007373 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375{
Victor Stinner554f3f02010-06-16 23:33:54 +00007376 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 BOOL *pusedDefaultChar = &usedDefaultChar;
7378 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007379 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007380 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 const DWORD flags = encode_code_page_flags(code_page, NULL);
7382 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007383 /* Create a substring so that we can get the UTF-16 representation
7384 of just the slice under consideration. */
7385 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007386
Martin v. Löwis3d325192011-11-04 18:23:06 +01007387 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007390 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007392 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007393
Victor Stinner2fc507f2011-11-04 20:06:39 +01007394 substring = PyUnicode_Substring(unicode, offset, offset+len);
7395 if (substring == NULL)
7396 return -1;
7397 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7398 if (p == NULL) {
7399 Py_DECREF(substring);
7400 return -1;
7401 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007402 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007403
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007404 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007406 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 NULL, 0,
7408 NULL, pusedDefaultChar);
7409 if (outsize <= 0)
7410 goto error;
7411 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007412 if (pusedDefaultChar && *pusedDefaultChar) {
7413 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007415 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007416
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007420 if (*outbytes == NULL) {
7421 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007423 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007425 }
7426 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 const Py_ssize_t n = PyBytes_Size(*outbytes);
7429 if (outsize > PY_SSIZE_T_MAX - n) {
7430 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007431 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007434 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7435 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007439 }
7440
7441 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007443 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007444 out, outsize,
7445 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007446 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 if (outsize <= 0)
7448 goto error;
7449 if (pusedDefaultChar && *pusedDefaultChar)
7450 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007451 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007452
Victor Stinner3a50e702011-10-18 21:21:00 +02007453error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007454 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7456 return -2;
7457 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007458 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007459}
7460
Victor Stinner3a50e702011-10-18 21:21:00 +02007461/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007462 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 * error handler.
7464 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007465 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 * -1 on other error.
7467 */
7468static int
7469encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007470 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007471 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007472{
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 Py_ssize_t pos = unicode_offset;
7475 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 /* Ideally, we should get reason from FormatMessage. This is the Windows
7477 2000 English version of the message. */
7478 const char *reason = "invalid character";
7479 /* 4=maximum length of a UTF-8 sequence */
7480 char buffer[4];
7481 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7482 Py_ssize_t outsize;
7483 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 PyObject *errorHandler = NULL;
7485 PyObject *exc = NULL;
7486 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007487 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007488 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 PyObject *rep;
7490 int ret = -1;
7491
7492 assert(insize > 0);
7493
7494 encoding = code_page_name(code_page, &encoding_obj);
7495 if (encoding == NULL)
7496 return -1;
7497
7498 if (errors == NULL || strcmp(errors, "strict") == 0) {
7499 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7500 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007501 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 if (exc != NULL) {
7503 PyCodec_StrictErrors(exc);
7504 Py_DECREF(exc);
7505 }
7506 Py_XDECREF(encoding_obj);
7507 return -1;
7508 }
7509
7510 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7511 pusedDefaultChar = &usedDefaultChar;
7512 else
7513 pusedDefaultChar = NULL;
7514
7515 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7516 PyErr_NoMemory();
7517 goto error;
7518 }
7519 outsize = insize * Py_ARRAY_LENGTH(buffer);
7520
7521 if (*outbytes == NULL) {
7522 /* Create string object */
7523 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7524 if (*outbytes == NULL)
7525 goto error;
7526 out = PyBytes_AS_STRING(*outbytes);
7527 }
7528 else {
7529 /* Extend string object */
7530 Py_ssize_t n = PyBytes_Size(*outbytes);
7531 if (n > PY_SSIZE_T_MAX - outsize) {
7532 PyErr_NoMemory();
7533 goto error;
7534 }
7535 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7536 goto error;
7537 out = PyBytes_AS_STRING(*outbytes) + n;
7538 }
7539
7540 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007541 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007542 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007543 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7544 wchar_t chars[2];
7545 int charsize;
7546 if (ch < 0x10000) {
7547 chars[0] = (wchar_t)ch;
7548 charsize = 1;
7549 }
7550 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007551 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7552 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007553 charsize = 2;
7554 }
7555
Victor Stinner3a50e702011-10-18 21:21:00 +02007556 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007557 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 buffer, Py_ARRAY_LENGTH(buffer),
7559 NULL, pusedDefaultChar);
7560 if (outsize > 0) {
7561 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7562 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007563 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007564 memcpy(out, buffer, outsize);
7565 out += outsize;
7566 continue;
7567 }
7568 }
7569 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7570 PyErr_SetFromWindowsErr(0);
7571 goto error;
7572 }
7573
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 rep = unicode_encode_call_errorhandler(
7575 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007576 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007577 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 if (rep == NULL)
7579 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007580 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007581
7582 if (PyBytes_Check(rep)) {
7583 outsize = PyBytes_GET_SIZE(rep);
7584 if (outsize != 1) {
7585 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7586 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7587 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7588 Py_DECREF(rep);
7589 goto error;
7590 }
7591 out = PyBytes_AS_STRING(*outbytes) + offset;
7592 }
7593 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7594 out += outsize;
7595 }
7596 else {
7597 Py_ssize_t i;
7598 enum PyUnicode_Kind kind;
7599 void *data;
7600
Benjamin Petersonbac79492012-01-14 13:34:47 -05007601 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 Py_DECREF(rep);
7603 goto error;
7604 }
7605
7606 outsize = PyUnicode_GET_LENGTH(rep);
7607 if (outsize != 1) {
7608 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7609 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7610 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7611 Py_DECREF(rep);
7612 goto error;
7613 }
7614 out = PyBytes_AS_STRING(*outbytes) + offset;
7615 }
7616 kind = PyUnicode_KIND(rep);
7617 data = PyUnicode_DATA(rep);
7618 for (i=0; i < outsize; i++) {
7619 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7620 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007621 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007622 encoding, unicode,
7623 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 "unable to encode error handler result to ASCII");
7625 Py_DECREF(rep);
7626 goto error;
7627 }
7628 *out = (unsigned char)ch;
7629 out++;
7630 }
7631 }
7632 Py_DECREF(rep);
7633 }
7634 /* write a NUL byte */
7635 *out = 0;
7636 outsize = out - PyBytes_AS_STRING(*outbytes);
7637 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7638 if (_PyBytes_Resize(outbytes, outsize) < 0)
7639 goto error;
7640 ret = 0;
7641
7642error:
7643 Py_XDECREF(encoding_obj);
7644 Py_XDECREF(errorHandler);
7645 Py_XDECREF(exc);
7646 return ret;
7647}
7648
Victor Stinner3a50e702011-10-18 21:21:00 +02007649static PyObject *
7650encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007651 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 const char *errors)
7653{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007654 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007655 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007656 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007657 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007658
Victor Stinner29dacf22015-01-26 16:41:32 +01007659 if (!PyUnicode_Check(unicode)) {
7660 PyErr_BadArgument();
7661 return NULL;
7662 }
7663
Benjamin Petersonbac79492012-01-14 13:34:47 -05007664 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007665 return NULL;
7666 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007667
Victor Stinner3a50e702011-10-18 21:21:00 +02007668 if (code_page < 0) {
7669 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7670 return NULL;
7671 }
7672
Martin v. Löwis3d325192011-11-04 18:23:06 +01007673 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007674 return PyBytes_FromStringAndSize(NULL, 0);
7675
Victor Stinner7581cef2011-11-03 22:32:33 +01007676 offset = 0;
7677 do
7678 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007679#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007680 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007681 chunks. */
7682 if (len > INT_MAX/2) {
7683 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007684 done = 0;
7685 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007686 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007687#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007688 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007689 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007690 done = 1;
7691 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007692
Victor Stinner76a31a62011-11-04 00:05:13 +01007693 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007694 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007695 errors);
7696 if (ret == -2)
7697 ret = encode_code_page_errors(code_page, &outbytes,
7698 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007699 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007700 if (ret < 0) {
7701 Py_XDECREF(outbytes);
7702 return NULL;
7703 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007704
Victor Stinner7581cef2011-11-03 22:32:33 +01007705 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007706 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007707 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007708
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 return outbytes;
7710}
7711
7712PyObject *
7713PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7714 Py_ssize_t size,
7715 const char *errors)
7716{
Victor Stinner7581cef2011-11-03 22:32:33 +01007717 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007718 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007719 if (unicode == NULL)
7720 return NULL;
7721 res = encode_code_page(CP_ACP, unicode, errors);
7722 Py_DECREF(unicode);
7723 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007724}
7725
7726PyObject *
7727PyUnicode_EncodeCodePage(int code_page,
7728 PyObject *unicode,
7729 const char *errors)
7730{
Victor Stinner7581cef2011-11-03 22:32:33 +01007731 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007732}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007733
Alexander Belopolsky40018472011-02-26 01:02:56 +00007734PyObject *
7735PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007736{
Victor Stinner7581cef2011-11-03 22:32:33 +01007737 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007738}
7739
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007740#undef NEED_RETRY
7741
Steve Dowercc16be82016-09-08 10:35:16 -07007742#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007743
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744/* --- Character Mapping Codec -------------------------------------------- */
7745
Victor Stinnerfb161b12013-04-18 01:44:27 +02007746static int
7747charmap_decode_string(const char *s,
7748 Py_ssize_t size,
7749 PyObject *mapping,
7750 const char *errors,
7751 _PyUnicodeWriter *writer)
7752{
7753 const char *starts = s;
7754 const char *e;
7755 Py_ssize_t startinpos, endinpos;
7756 PyObject *errorHandler = NULL, *exc = NULL;
7757 Py_ssize_t maplen;
7758 enum PyUnicode_Kind mapkind;
7759 void *mapdata;
7760 Py_UCS4 x;
7761 unsigned char ch;
7762
7763 if (PyUnicode_READY(mapping) == -1)
7764 return -1;
7765
7766 maplen = PyUnicode_GET_LENGTH(mapping);
7767 mapdata = PyUnicode_DATA(mapping);
7768 mapkind = PyUnicode_KIND(mapping);
7769
7770 e = s + size;
7771
7772 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7773 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7774 * is disabled in encoding aliases, latin1 is preferred because
7775 * its implementation is faster. */
7776 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7777 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7778 Py_UCS4 maxchar = writer->maxchar;
7779
7780 assert (writer->kind == PyUnicode_1BYTE_KIND);
7781 while (s < e) {
7782 ch = *s;
7783 x = mapdata_ucs1[ch];
7784 if (x > maxchar) {
7785 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7786 goto onError;
7787 maxchar = writer->maxchar;
7788 outdata = (Py_UCS1 *)writer->data;
7789 }
7790 outdata[writer->pos] = x;
7791 writer->pos++;
7792 ++s;
7793 }
7794 return 0;
7795 }
7796
7797 while (s < e) {
7798 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7799 enum PyUnicode_Kind outkind = writer->kind;
7800 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7801 if (outkind == PyUnicode_1BYTE_KIND) {
7802 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7803 Py_UCS4 maxchar = writer->maxchar;
7804 while (s < e) {
7805 ch = *s;
7806 x = mapdata_ucs2[ch];
7807 if (x > maxchar)
7808 goto Error;
7809 outdata[writer->pos] = x;
7810 writer->pos++;
7811 ++s;
7812 }
7813 break;
7814 }
7815 else if (outkind == PyUnicode_2BYTE_KIND) {
7816 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7817 while (s < e) {
7818 ch = *s;
7819 x = mapdata_ucs2[ch];
7820 if (x == 0xFFFE)
7821 goto Error;
7822 outdata[writer->pos] = x;
7823 writer->pos++;
7824 ++s;
7825 }
7826 break;
7827 }
7828 }
7829 ch = *s;
7830
7831 if (ch < maplen)
7832 x = PyUnicode_READ(mapkind, mapdata, ch);
7833 else
7834 x = 0xfffe; /* invalid value */
7835Error:
7836 if (x == 0xfffe)
7837 {
7838 /* undefined mapping */
7839 startinpos = s-starts;
7840 endinpos = startinpos+1;
7841 if (unicode_decode_call_errorhandler_writer(
7842 errors, &errorHandler,
7843 "charmap", "character maps to <undefined>",
7844 &starts, &e, &startinpos, &endinpos, &exc, &s,
7845 writer)) {
7846 goto onError;
7847 }
7848 continue;
7849 }
7850
7851 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7852 goto onError;
7853 ++s;
7854 }
7855 Py_XDECREF(errorHandler);
7856 Py_XDECREF(exc);
7857 return 0;
7858
7859onError:
7860 Py_XDECREF(errorHandler);
7861 Py_XDECREF(exc);
7862 return -1;
7863}
7864
7865static int
7866charmap_decode_mapping(const char *s,
7867 Py_ssize_t size,
7868 PyObject *mapping,
7869 const char *errors,
7870 _PyUnicodeWriter *writer)
7871{
7872 const char *starts = s;
7873 const char *e;
7874 Py_ssize_t startinpos, endinpos;
7875 PyObject *errorHandler = NULL, *exc = NULL;
7876 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007877 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007878
7879 e = s + size;
7880
7881 while (s < e) {
7882 ch = *s;
7883
7884 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7885 key = PyLong_FromLong((long)ch);
7886 if (key == NULL)
7887 goto onError;
7888
7889 item = PyObject_GetItem(mapping, key);
7890 Py_DECREF(key);
7891 if (item == NULL) {
7892 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7893 /* No mapping found means: mapping is undefined. */
7894 PyErr_Clear();
7895 goto Undefined;
7896 } else
7897 goto onError;
7898 }
7899
7900 /* Apply mapping */
7901 if (item == Py_None)
7902 goto Undefined;
7903 if (PyLong_Check(item)) {
7904 long value = PyLong_AS_LONG(item);
7905 if (value == 0xFFFE)
7906 goto Undefined;
7907 if (value < 0 || value > MAX_UNICODE) {
7908 PyErr_Format(PyExc_TypeError,
7909 "character mapping must be in range(0x%lx)",
7910 (unsigned long)MAX_UNICODE + 1);
7911 goto onError;
7912 }
7913
7914 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7915 goto onError;
7916 }
7917 else if (PyUnicode_Check(item)) {
7918 if (PyUnicode_READY(item) == -1)
7919 goto onError;
7920 if (PyUnicode_GET_LENGTH(item) == 1) {
7921 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7922 if (value == 0xFFFE)
7923 goto Undefined;
7924 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7925 goto onError;
7926 }
7927 else {
7928 writer->overallocate = 1;
7929 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7930 goto onError;
7931 }
7932 }
7933 else {
7934 /* wrong return value */
7935 PyErr_SetString(PyExc_TypeError,
7936 "character mapping must return integer, None or str");
7937 goto onError;
7938 }
7939 Py_CLEAR(item);
7940 ++s;
7941 continue;
7942
7943Undefined:
7944 /* undefined mapping */
7945 Py_CLEAR(item);
7946 startinpos = s-starts;
7947 endinpos = startinpos+1;
7948 if (unicode_decode_call_errorhandler_writer(
7949 errors, &errorHandler,
7950 "charmap", "character maps to <undefined>",
7951 &starts, &e, &startinpos, &endinpos, &exc, &s,
7952 writer)) {
7953 goto onError;
7954 }
7955 }
7956 Py_XDECREF(errorHandler);
7957 Py_XDECREF(exc);
7958 return 0;
7959
7960onError:
7961 Py_XDECREF(item);
7962 Py_XDECREF(errorHandler);
7963 Py_XDECREF(exc);
7964 return -1;
7965}
7966
Alexander Belopolsky40018472011-02-26 01:02:56 +00007967PyObject *
7968PyUnicode_DecodeCharmap(const char *s,
7969 Py_ssize_t size,
7970 PyObject *mapping,
7971 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007973 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007974
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 /* Default to Latin-1 */
7976 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007980 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007981 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007982 writer.min_length = size;
7983 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007985
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007986 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007987 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7988 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007989 }
7990 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007991 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7992 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007994 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007995
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007997 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 return NULL;
7999}
8000
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008001/* Charmap encoding: the lookup table */
8002
Alexander Belopolsky40018472011-02-26 01:02:56 +00008003struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 PyObject_HEAD
8005 unsigned char level1[32];
8006 int count2, count3;
8007 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008};
8009
8010static PyObject*
8011encoding_map_size(PyObject *obj, PyObject* args)
8012{
8013 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008014 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008016}
8017
8018static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 PyDoc_STR("Return the size (in bytes) of this object") },
8021 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008022};
8023
8024static void
8025encoding_map_dealloc(PyObject* o)
8026{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008027 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028}
8029
8030static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008031 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 "EncodingMap", /*tp_name*/
8033 sizeof(struct encoding_map), /*tp_basicsize*/
8034 0, /*tp_itemsize*/
8035 /* methods */
8036 encoding_map_dealloc, /*tp_dealloc*/
8037 0, /*tp_print*/
8038 0, /*tp_getattr*/
8039 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008040 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 0, /*tp_repr*/
8042 0, /*tp_as_number*/
8043 0, /*tp_as_sequence*/
8044 0, /*tp_as_mapping*/
8045 0, /*tp_hash*/
8046 0, /*tp_call*/
8047 0, /*tp_str*/
8048 0, /*tp_getattro*/
8049 0, /*tp_setattro*/
8050 0, /*tp_as_buffer*/
8051 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8052 0, /*tp_doc*/
8053 0, /*tp_traverse*/
8054 0, /*tp_clear*/
8055 0, /*tp_richcompare*/
8056 0, /*tp_weaklistoffset*/
8057 0, /*tp_iter*/
8058 0, /*tp_iternext*/
8059 encoding_map_methods, /*tp_methods*/
8060 0, /*tp_members*/
8061 0, /*tp_getset*/
8062 0, /*tp_base*/
8063 0, /*tp_dict*/
8064 0, /*tp_descr_get*/
8065 0, /*tp_descr_set*/
8066 0, /*tp_dictoffset*/
8067 0, /*tp_init*/
8068 0, /*tp_alloc*/
8069 0, /*tp_new*/
8070 0, /*tp_free*/
8071 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008072};
8073
8074PyObject*
8075PyUnicode_BuildEncodingMap(PyObject* string)
8076{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008077 PyObject *result;
8078 struct encoding_map *mresult;
8079 int i;
8080 int need_dict = 0;
8081 unsigned char level1[32];
8082 unsigned char level2[512];
8083 unsigned char *mlevel1, *mlevel2, *mlevel3;
8084 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085 int kind;
8086 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008087 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008088 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008089
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008090 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008091 PyErr_BadArgument();
8092 return NULL;
8093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008094 kind = PyUnicode_KIND(string);
8095 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008096 length = PyUnicode_GET_LENGTH(string);
8097 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 memset(level1, 0xFF, sizeof level1);
8099 memset(level2, 0xFF, sizeof level2);
8100
8101 /* If there isn't a one-to-one mapping of NULL to \0,
8102 or if there are non-BMP characters, we need to use
8103 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008106 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 ch = PyUnicode_READ(kind, data, i);
8109 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 need_dict = 1;
8111 break;
8112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114 /* unmapped character */
8115 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 l1 = ch >> 11;
8117 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008118 if (level1[l1] == 0xFF)
8119 level1[l1] = count2++;
8120 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008121 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122 }
8123
8124 if (count2 >= 0xFF || count3 >= 0xFF)
8125 need_dict = 1;
8126
8127 if (need_dict) {
8128 PyObject *result = PyDict_New();
8129 PyObject *key, *value;
8130 if (!result)
8131 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008132 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008133 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008134 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 if (!key || !value)
8136 goto failed1;
8137 if (PyDict_SetItem(result, key, value) == -1)
8138 goto failed1;
8139 Py_DECREF(key);
8140 Py_DECREF(value);
8141 }
8142 return result;
8143 failed1:
8144 Py_XDECREF(key);
8145 Py_XDECREF(value);
8146 Py_DECREF(result);
8147 return NULL;
8148 }
8149
8150 /* Create a three-level trie */
8151 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8152 16*count2 + 128*count3 - 1);
8153 if (!result)
8154 return PyErr_NoMemory();
8155 PyObject_Init(result, &EncodingMapType);
8156 mresult = (struct encoding_map*)result;
8157 mresult->count2 = count2;
8158 mresult->count3 = count3;
8159 mlevel1 = mresult->level1;
8160 mlevel2 = mresult->level23;
8161 mlevel3 = mresult->level23 + 16*count2;
8162 memcpy(mlevel1, level1, 32);
8163 memset(mlevel2, 0xFF, 16*count2);
8164 memset(mlevel3, 0, 128*count3);
8165 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008166 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008168 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8169 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 /* unmapped character */
8171 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008172 o1 = ch>>11;
8173 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 i2 = 16*mlevel1[o1] + o2;
8175 if (mlevel2[i2] == 0xFF)
8176 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008177 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178 i3 = 128*mlevel2[i2] + o3;
8179 mlevel3[i3] = i;
8180 }
8181 return result;
8182}
8183
8184static int
Victor Stinner22168992011-11-20 17:09:18 +01008185encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008186{
8187 struct encoding_map *map = (struct encoding_map*)mapping;
8188 int l1 = c>>11;
8189 int l2 = (c>>7) & 0xF;
8190 int l3 = c & 0x7F;
8191 int i;
8192
Victor Stinner22168992011-11-20 17:09:18 +01008193 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008195 if (c == 0)
8196 return 0;
8197 /* level 1*/
8198 i = map->level1[l1];
8199 if (i == 0xFF) {
8200 return -1;
8201 }
8202 /* level 2*/
8203 i = map->level23[16*i+l2];
8204 if (i == 0xFF) {
8205 return -1;
8206 }
8207 /* level 3 */
8208 i = map->level23[16*map->count2 + 128*i + l3];
8209 if (i == 0) {
8210 return -1;
8211 }
8212 return i;
8213}
8214
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008215/* Lookup the character ch in the mapping. If the character
8216 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008217 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008218static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008219charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220{
Christian Heimes217cfd12007-12-02 14:31:20 +00008221 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008222 PyObject *x;
8223
8224 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008226 x = PyObject_GetItem(mapping, w);
8227 Py_DECREF(w);
8228 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8230 /* No mapping found means: mapping is undefined. */
8231 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008232 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 } else
8234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008236 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008238 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 long value = PyLong_AS_LONG(x);
8240 if (value < 0 || value > 255) {
8241 PyErr_SetString(PyExc_TypeError,
8242 "character mapping must be in range(256)");
8243 Py_DECREF(x);
8244 return NULL;
8245 }
8246 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008248 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 /* wrong return value */
8252 PyErr_Format(PyExc_TypeError,
8253 "character mapping must return integer, bytes or None, not %.400s",
8254 x->ob_type->tp_name);
8255 Py_DECREF(x);
8256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 }
8258}
8259
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008260static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008261charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008262{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008263 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8264 /* exponentially overallocate to minimize reallocations */
8265 if (requiredsize < 2*outsize)
8266 requiredsize = 2*outsize;
8267 if (_PyBytes_Resize(outobj, requiredsize))
8268 return -1;
8269 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008270}
8271
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008276 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 space is available. Return a new reference to the object that
8278 was put in the output buffer, or Py_None, if the mapping was undefined
8279 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008280 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008281static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008282charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008283 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008285 PyObject *rep;
8286 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008287 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008288
Christian Heimes90aa7642007-12-19 02:45:37 +00008289 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008292 if (res == -1)
8293 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 if (outsize<requiredsize)
8295 if (charmapencode_resize(outobj, outpos, requiredsize))
8296 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008297 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 outstart[(*outpos)++] = (char)res;
8299 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008300 }
8301
8302 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008305 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 Py_DECREF(rep);
8307 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008308 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 if (PyLong_Check(rep)) {
8310 Py_ssize_t requiredsize = *outpos+1;
8311 if (outsize<requiredsize)
8312 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8313 Py_DECREF(rep);
8314 return enc_EXCEPTION;
8315 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008316 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008318 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 else {
8320 const char *repchars = PyBytes_AS_STRING(rep);
8321 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8322 Py_ssize_t requiredsize = *outpos+repsize;
8323 if (outsize<requiredsize)
8324 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8325 Py_DECREF(rep);
8326 return enc_EXCEPTION;
8327 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008328 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 memcpy(outstart + *outpos, repchars, repsize);
8330 *outpos += repsize;
8331 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 Py_DECREF(rep);
8334 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335}
8336
8337/* handle an error in PyUnicode_EncodeCharmap
8338 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008339static int
8340charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008341 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008343 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008344 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345{
8346 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008347 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008348 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008349 enum PyUnicode_Kind kind;
8350 void *data;
8351 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008353 Py_ssize_t collstartpos = *inpos;
8354 Py_ssize_t collendpos = *inpos+1;
8355 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008356 const char *encoding = "charmap";
8357 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008359 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008360 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361
Benjamin Petersonbac79492012-01-14 13:34:47 -05008362 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008363 return -1;
8364 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 /* find all unencodable characters */
8366 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008367 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008368 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008369 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008370 val = encoding_map_lookup(ch, mapping);
8371 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 break;
8373 ++collendpos;
8374 continue;
8375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008376
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008377 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8378 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 if (rep==NULL)
8380 return -1;
8381 else if (rep!=Py_None) {
8382 Py_DECREF(rep);
8383 break;
8384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008385 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 }
8388 /* cache callback name lookup
8389 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008390 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008391 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008392
8393 switch (*error_handler) {
8394 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008395 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008396 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008397
8398 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008399 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 x = charmapencode_output('?', mapping, res, respos);
8401 if (x==enc_EXCEPTION) {
8402 return -1;
8403 }
8404 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008405 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 return -1;
8407 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408 }
8409 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008410 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008411 *inpos = collendpos;
8412 break;
Victor Stinner50149202015-09-22 00:26:54 +02008413
8414 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008415 /* generate replacement (temporarily (mis)uses p) */
8416 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 char buffer[2+29+1+1];
8418 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 for (cp = buffer; *cp; ++cp) {
8421 x = charmapencode_output(*cp, mapping, res, respos);
8422 if (x==enc_EXCEPTION)
8423 return -1;
8424 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008425 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 return -1;
8427 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428 }
8429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008430 *inpos = collendpos;
8431 break;
Victor Stinner50149202015-09-22 00:26:54 +02008432
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 default:
Victor Stinner50149202015-09-22 00:26:54 +02008434 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008435 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008437 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008439 if (PyBytes_Check(repunicode)) {
8440 /* Directly copy bytes result to output. */
8441 Py_ssize_t outsize = PyBytes_Size(*res);
8442 Py_ssize_t requiredsize;
8443 repsize = PyBytes_Size(repunicode);
8444 requiredsize = *respos + repsize;
8445 if (requiredsize > outsize)
8446 /* Make room for all additional bytes. */
8447 if (charmapencode_resize(res, respos, requiredsize)) {
8448 Py_DECREF(repunicode);
8449 return -1;
8450 }
8451 memcpy(PyBytes_AsString(*res) + *respos,
8452 PyBytes_AsString(repunicode), repsize);
8453 *respos += repsize;
8454 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008455 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008456 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008457 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008459 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008460 Py_DECREF(repunicode);
8461 return -1;
8462 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008463 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008464 data = PyUnicode_DATA(repunicode);
8465 kind = PyUnicode_KIND(repunicode);
8466 for (index = 0; index < repsize; index++) {
8467 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8468 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008470 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 return -1;
8472 }
8473 else if (x==enc_FAILED) {
8474 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008475 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 return -1;
8477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 }
8479 *inpos = newpos;
8480 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008481 }
8482 return 0;
8483}
8484
Alexander Belopolsky40018472011-02-26 01:02:56 +00008485PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008486_PyUnicode_EncodeCharmap(PyObject *unicode,
8487 PyObject *mapping,
8488 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 /* output object */
8491 PyObject *res = NULL;
8492 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008493 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008494 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008496 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008497 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008498 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008499 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008500 void *data;
8501 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502
Benjamin Petersonbac79492012-01-14 13:34:47 -05008503 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008504 return NULL;
8505 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008506 data = PyUnicode_DATA(unicode);
8507 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008508
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509 /* Default to Latin-1 */
8510 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008511 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 /* allocate enough for a simple encoding without
8514 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008515 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516 if (res == NULL)
8517 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008518 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008522 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008524 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 if (x==enc_EXCEPTION) /* error */
8526 goto onError;
8527 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008528 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008530 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 &res, &respos)) {
8532 goto onError;
8533 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008534 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 else
8536 /* done with this character => adjust input position */
8537 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008541 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008542 if (_PyBytes_Resize(&res, respos) < 0)
8543 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008544
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008546 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 return res;
8548
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 Py_XDECREF(res);
8551 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008552 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553 return NULL;
8554}
8555
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556/* Deprecated */
8557PyObject *
8558PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8559 Py_ssize_t size,
8560 PyObject *mapping,
8561 const char *errors)
8562{
8563 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008564 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008565 if (unicode == NULL)
8566 return NULL;
8567 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8568 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008569 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008570}
8571
Alexander Belopolsky40018472011-02-26 01:02:56 +00008572PyObject *
8573PyUnicode_AsCharmapString(PyObject *unicode,
8574 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575{
8576 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 PyErr_BadArgument();
8578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581}
8582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008584static void
8585make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008587 Py_ssize_t startpos, Py_ssize_t endpos,
8588 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 *exceptionObject = _PyUnicodeTranslateError_Create(
8592 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 }
8594 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8596 goto onError;
8597 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8598 goto onError;
8599 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8600 goto onError;
8601 return;
8602 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008603 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 }
8605}
8606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008607/* error handling callback helper:
8608 build arguments, call the callback and check the arguments,
8609 put the result into newpos and return the replacement string, which
8610 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008611static PyObject *
8612unicode_translate_call_errorhandler(const char *errors,
8613 PyObject **errorHandler,
8614 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008616 Py_ssize_t startpos, Py_ssize_t endpos,
8617 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008619 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008621 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622 PyObject *restuple;
8623 PyObject *resunicode;
8624
8625 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 }
8630
8631 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008636 restuple = PyObject_CallFunctionObjArgs(
8637 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008641 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 Py_DECREF(restuple);
8643 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008645 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 &resunicode, &i_newpos)) {
8647 Py_DECREF(restuple);
8648 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008650 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008652 else
8653 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008655 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 Py_DECREF(restuple);
8657 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008658 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659 Py_INCREF(resunicode);
8660 Py_DECREF(restuple);
8661 return resunicode;
8662}
8663
8664/* Lookup the character ch in the mapping and put the result in result,
8665 which must be decrefed by the caller.
8666 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008667static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669{
Christian Heimes217cfd12007-12-02 14:31:20 +00008670 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 PyObject *x;
8672
8673 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 x = PyObject_GetItem(mapping, w);
8676 Py_DECREF(w);
8677 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8679 /* No mapping found means: use 1:1 mapping. */
8680 PyErr_Clear();
8681 *result = NULL;
8682 return 0;
8683 } else
8684 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 }
8686 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 *result = x;
8688 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008690 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008692 if (value < 0 || value > MAX_UNICODE) {
8693 PyErr_Format(PyExc_ValueError,
8694 "character mapping must be in range(0x%x)",
8695 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 Py_DECREF(x);
8697 return -1;
8698 }
8699 *result = x;
8700 return 0;
8701 }
8702 else if (PyUnicode_Check(x)) {
8703 *result = x;
8704 return 0;
8705 }
8706 else {
8707 /* wrong return value */
8708 PyErr_SetString(PyExc_TypeError,
8709 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008710 Py_DECREF(x);
8711 return -1;
8712 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713}
Victor Stinner1194ea02014-04-04 19:37:40 +02008714
8715/* lookup the character, write the result into the writer.
8716 Return 1 if the result was written into the writer, return 0 if the mapping
8717 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008718static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008719charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8720 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721{
Victor Stinner1194ea02014-04-04 19:37:40 +02008722 PyObject *item;
8723
8724 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008726
8727 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008729 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008732 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008734
8735 if (item == Py_None) {
8736 Py_DECREF(item);
8737 return 0;
8738 }
8739
8740 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008741 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8742 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8743 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008744 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8745 Py_DECREF(item);
8746 return -1;
8747 }
8748 Py_DECREF(item);
8749 return 1;
8750 }
8751
8752 if (!PyUnicode_Check(item)) {
8753 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008755 }
8756
8757 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8758 Py_DECREF(item);
8759 return -1;
8760 }
8761
8762 Py_DECREF(item);
8763 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764}
8765
Victor Stinner89a76ab2014-04-05 11:44:04 +02008766static int
8767unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8768 Py_UCS1 *translate)
8769{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008770 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008771 int ret = 0;
8772
Victor Stinner89a76ab2014-04-05 11:44:04 +02008773 if (charmaptranslate_lookup(ch, mapping, &item)) {
8774 return -1;
8775 }
8776
8777 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008778 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008779 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008780 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008781 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008782 /* not found => default to 1:1 mapping */
8783 translate[ch] = ch;
8784 return 1;
8785 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008786 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008787 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008788 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8789 used it */
8790 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008791 /* invalid character or character outside ASCII:
8792 skip the fast translate */
8793 goto exit;
8794 }
8795 translate[ch] = (Py_UCS1)replace;
8796 }
8797 else if (PyUnicode_Check(item)) {
8798 Py_UCS4 replace;
8799
8800 if (PyUnicode_READY(item) == -1) {
8801 Py_DECREF(item);
8802 return -1;
8803 }
8804 if (PyUnicode_GET_LENGTH(item) != 1)
8805 goto exit;
8806
8807 replace = PyUnicode_READ_CHAR(item, 0);
8808 if (replace > 127)
8809 goto exit;
8810 translate[ch] = (Py_UCS1)replace;
8811 }
8812 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008813 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008814 goto exit;
8815 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 ret = 1;
8817
Benjamin Peterson1365de72014-04-07 20:15:41 -04008818 exit:
8819 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008820 return ret;
8821}
8822
8823/* Fast path for ascii => ascii translation. Return 1 if the whole string
8824 was translated into writer, return 0 if the input string was partially
8825 translated into writer, raise an exception and return -1 on error. */
8826static int
8827unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008828 _PyUnicodeWriter *writer, int ignore,
8829 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830{
Victor Stinner872b2912014-04-05 14:27:07 +02008831 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008832 Py_ssize_t len;
8833 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008834 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008835
Victor Stinner89a76ab2014-04-05 11:44:04 +02008836 len = PyUnicode_GET_LENGTH(input);
8837
Victor Stinner872b2912014-04-05 14:27:07 +02008838 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839
8840 in = PyUnicode_1BYTE_DATA(input);
8841 end = in + len;
8842
8843 assert(PyUnicode_IS_ASCII(writer->buffer));
8844 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8845 out = PyUnicode_1BYTE_DATA(writer->buffer);
8846
Victor Stinner872b2912014-04-05 14:27:07 +02008847 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008848 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008849 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008850 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008851 int translate = unicode_fast_translate_lookup(mapping, ch,
8852 ascii_table);
8853 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008854 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008855 if (translate == 0)
8856 goto exit;
8857 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 }
Victor Stinner872b2912014-04-05 14:27:07 +02008859 if (ch2 == 0xfe) {
8860 if (ignore)
8861 continue;
8862 goto exit;
8863 }
8864 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008866 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008867 }
Victor Stinner872b2912014-04-05 14:27:07 +02008868 res = 1;
8869
8870exit:
8871 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008872 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008873 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874}
8875
Victor Stinner3222da22015-10-01 22:07:32 +02008876static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877_PyUnicode_TranslateCharmap(PyObject *input,
8878 PyObject *mapping,
8879 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008882 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 Py_ssize_t size, i;
8884 int kind;
8885 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008886 _PyUnicodeWriter writer;
8887 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008888 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008889 PyObject *errorHandler = NULL;
8890 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008891 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008893
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 PyErr_BadArgument();
8896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 if (PyUnicode_READY(input) == -1)
8900 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008901 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 kind = PyUnicode_KIND(input);
8903 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008905 if (size == 0)
8906 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008908 /* allocate enough for a simple 1:1 translation without
8909 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008910 _PyUnicodeWriter_Init(&writer);
8911 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913
Victor Stinner872b2912014-04-05 14:27:07 +02008914 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8915
Victor Stinner33798672016-03-01 21:59:58 +01008916 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008918 if (PyUnicode_IS_ASCII(input)) {
8919 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8920 if (res < 0) {
8921 _PyUnicodeWriter_Dealloc(&writer);
8922 return NULL;
8923 }
8924 if (res == 1)
8925 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926 }
Victor Stinner33798672016-03-01 21:59:58 +01008927 else {
8928 i = 0;
8929 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 int translate;
8934 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8935 Py_ssize_t newpos;
8936 /* startpos for collecting untranslatable chars */
8937 Py_ssize_t collstart;
8938 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008939 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940
Victor Stinner1194ea02014-04-04 19:37:40 +02008941 ch = PyUnicode_READ(kind, data, i);
8942 translate = charmaptranslate_output(ch, mapping, &writer);
8943 if (translate < 0)
8944 goto onError;
8945
8946 if (translate != 0) {
8947 /* it worked => adjust input pointer */
8948 ++i;
8949 continue;
8950 }
8951
8952 /* untranslatable character */
8953 collstart = i;
8954 collend = i+1;
8955
8956 /* find all untranslatable characters */
8957 while (collend < size) {
8958 PyObject *x;
8959 ch = PyUnicode_READ(kind, data, collend);
8960 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008961 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008962 Py_XDECREF(x);
8963 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008965 ++collend;
8966 }
8967
8968 if (ignore) {
8969 i = collend;
8970 }
8971 else {
8972 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8973 reason, input, &exc,
8974 collstart, collend, &newpos);
8975 if (repunicode == NULL)
8976 goto onError;
8977 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008979 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008980 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 Py_DECREF(repunicode);
8982 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008983 }
8984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985 Py_XDECREF(exc);
8986 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008987 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008990 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008991 Py_XDECREF(exc);
8992 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 return NULL;
8994}
8995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996/* Deprecated. Use PyUnicode_Translate instead. */
8997PyObject *
8998PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8999 Py_ssize_t size,
9000 PyObject *mapping,
9001 const char *errors)
9002{
Christian Heimes5f520f42012-09-11 14:03:25 +02009003 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009004 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 if (!unicode)
9006 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009007 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9008 Py_DECREF(unicode);
9009 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010}
9011
Alexander Belopolsky40018472011-02-26 01:02:56 +00009012PyObject *
9013PyUnicode_Translate(PyObject *str,
9014 PyObject *mapping,
9015 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009017 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009018 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009019 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020}
Tim Petersced69f82003-09-16 20:30:58 +00009021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022PyObject *
9023_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9024{
9025 if (!PyUnicode_Check(unicode)) {
9026 PyErr_BadInternalCall();
9027 return NULL;
9028 }
9029 if (PyUnicode_READY(unicode) == -1)
9030 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009031 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 /* If the string is already ASCII, just return the same string */
9033 Py_INCREF(unicode);
9034 return unicode;
9035 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009036
9037 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9038 PyObject *result = PyUnicode_New(len, 127);
9039 if (result == NULL) {
9040 return NULL;
9041 }
9042
9043 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9044 int kind = PyUnicode_KIND(unicode);
9045 const void *data = PyUnicode_DATA(unicode);
9046 Py_ssize_t i;
9047 for (i = 0; i < len; ++i) {
9048 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9049 if (ch < 127) {
9050 out[i] = ch;
9051 }
9052 else if (Py_UNICODE_ISSPACE(ch)) {
9053 out[i] = ' ';
9054 }
9055 else {
9056 int decimal = Py_UNICODE_TODECIMAL(ch);
9057 if (decimal < 0) {
9058 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009059 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009060 _PyUnicode_LENGTH(result) = i + 1;
9061 break;
9062 }
9063 out[i] = '0' + decimal;
9064 }
9065 }
9066
INADA Naoki16dfca42018-07-14 12:06:43 +09009067 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009068 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069}
9070
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009071PyObject *
9072PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9073 Py_ssize_t length)
9074{
Victor Stinnerf0124502011-11-21 23:12:56 +01009075 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009076 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009077 Py_UCS4 maxchar;
9078 enum PyUnicode_Kind kind;
9079 void *data;
9080
Victor Stinner99d7ad02012-02-22 13:37:39 +01009081 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009082 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009083 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009084 if (ch > 127) {
9085 int decimal = Py_UNICODE_TODECIMAL(ch);
9086 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009087 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009088 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009089 }
9090 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009091
9092 /* Copy to a new string */
9093 decimal = PyUnicode_New(length, maxchar);
9094 if (decimal == NULL)
9095 return decimal;
9096 kind = PyUnicode_KIND(decimal);
9097 data = PyUnicode_DATA(decimal);
9098 /* Iterate over code points */
9099 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009100 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009101 if (ch > 127) {
9102 int decimal = Py_UNICODE_TODECIMAL(ch);
9103 if (decimal >= 0)
9104 ch = '0' + decimal;
9105 }
9106 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009108 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009109}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009110/* --- Decimal Encoder ---------------------------------------------------- */
9111
Alexander Belopolsky40018472011-02-26 01:02:56 +00009112int
9113PyUnicode_EncodeDecimal(Py_UNICODE *s,
9114 Py_ssize_t length,
9115 char *output,
9116 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009117{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009118 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009119 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009120 enum PyUnicode_Kind kind;
9121 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009122
9123 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 PyErr_BadArgument();
9125 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009126 }
9127
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009128 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009129 if (unicode == NULL)
9130 return -1;
9131
Victor Stinner42bf7752011-11-21 22:52:58 +01009132 kind = PyUnicode_KIND(unicode);
9133 data = PyUnicode_DATA(unicode);
9134
Victor Stinnerb84d7232011-11-22 01:50:07 +01009135 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009136 PyObject *exc;
9137 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009139 Py_ssize_t startpos;
9140
9141 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009142
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009144 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009145 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009147 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009148 decimal = Py_UNICODE_TODECIMAL(ch);
9149 if (decimal >= 0) {
9150 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009151 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 continue;
9153 }
9154 if (0 < ch && ch < 256) {
9155 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009156 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 continue;
9158 }
Victor Stinner6345be92011-11-25 20:09:01 +01009159
Victor Stinner42bf7752011-11-21 22:52:58 +01009160 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009161 exc = NULL;
9162 raise_encode_exception(&exc, "decimal", unicode,
9163 startpos, startpos+1,
9164 "invalid decimal Unicode string");
9165 Py_XDECREF(exc);
9166 Py_DECREF(unicode);
9167 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009168 }
9169 /* 0-terminate the output string */
9170 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009171 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009172 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009173}
9174
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175/* --- Helpers ------------------------------------------------------------ */
9176
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the length of the
   sequence being sliced.  `end` is clipped to `len`; a negative `end` or
   `start` is taken relative to `len` and then clamped at 0.  `start` is not
   clipped against `len`, so callers still have to compare start and end.

   The body is wrapped in do { ... } while (0) so the macro expands to a
   single statement and is safe inside unbraced if/else.  Arguments are
   parenthesized; note that each may be evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009193any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009195 Py_ssize_t end,
9196 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009198 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 void *buf1, *buf2;
9200 Py_ssize_t len1, len2, result;
9201
9202 kind1 = PyUnicode_KIND(s1);
9203 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009204 if (kind1 < kind2)
9205 return -1;
9206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 len1 = PyUnicode_GET_LENGTH(s1);
9208 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009209 ADJUST_INDICES(start, end, len1);
9210 if (end - start < len2)
9211 return -1;
9212
9213 buf1 = PyUnicode_DATA(s1);
9214 buf2 = PyUnicode_DATA(s2);
9215 if (len2 == 1) {
9216 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9217 result = findchar((const char *)buf1 + kind1*start,
9218 kind1, end - start, ch, direction);
9219 if (result == -1)
9220 return -1;
9221 else
9222 return start + result;
9223 }
9224
9225 if (kind2 != kind1) {
9226 buf2 = _PyUnicode_AsKind(s2, kind1);
9227 if (!buf2)
9228 return -2;
9229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230
Victor Stinner794d5672011-10-10 03:21:36 +02009231 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009232 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009233 case PyUnicode_1BYTE_KIND:
9234 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9235 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9236 else
9237 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9238 break;
9239 case PyUnicode_2BYTE_KIND:
9240 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9241 break;
9242 case PyUnicode_4BYTE_KIND:
9243 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9244 break;
9245 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009246 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009247 }
9248 }
9249 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009250 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009251 case PyUnicode_1BYTE_KIND:
9252 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9253 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9254 else
9255 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9256 break;
9257 case PyUnicode_2BYTE_KIND:
9258 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9259 break;
9260 case PyUnicode_4BYTE_KIND:
9261 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9262 break;
9263 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009264 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009265 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 }
9267
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009268 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 PyMem_Free(buf2);
9270
9271 return result;
9272}
9273
Victor Stinner59423e32018-11-26 13:40:01 +01009274/* _PyUnicode_InsertThousandsGrouping() helper functions */
9275#include "stringlib/localeutil.h"
9276
9277/**
9278 * InsertThousandsGrouping:
9279 * @writer: Unicode writer.
9280 * @n_buffer: Number of characters in @buffer.
9281 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9282 * @d_pos: Start of digits string.
9283 * @n_digits: The number of digits in the string, in which we want
9284 * to put the grouping chars.
9285 * @min_width: The minimum width of the digits in the output string.
9286 * Output will be zero-padded on the left to fill.
9287 * @grouping: see definition in localeconv().
9288 * @thousands_sep: see definition in localeconv().
9289 *
9290 * There are 2 modes: counting and filling. If @writer is NULL,
9291 * we are in counting mode, else filling mode.
9292 * If counting, the required buffer size is returned.
9293 * If filling, we know the buffer will be large enough, so we don't
9294 * need to pass in the buffer size.
9295 * Inserts thousand grouping characters (as defined by grouping and
9296 * thousands_sep) into @writer.
9297 *
9298 * Return value: -1 on error, number of characters otherwise.
9299 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009301_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009302 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009303 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009304 PyObject *digits,
9305 Py_ssize_t d_pos,
9306 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009307 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009308 const char *grouping,
9309 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009310 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311{
Xtreak3f7983a2019-01-07 20:39:14 +05309312 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009313 if (writer) {
9314 assert(digits != NULL);
9315 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009316 }
9317 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009318 assert(digits == NULL);
9319 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009320 }
Victor Stinner59423e32018-11-26 13:40:01 +01009321 assert(0 <= d_pos);
9322 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009323 assert(grouping != NULL);
9324
9325 if (digits != NULL) {
9326 if (PyUnicode_READY(digits) == -1) {
9327 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009328 }
Victor Stinner59423e32018-11-26 13:40:01 +01009329 }
9330 if (PyUnicode_READY(thousands_sep) == -1) {
9331 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 }
9333
Victor Stinner59423e32018-11-26 13:40:01 +01009334 Py_ssize_t count = 0;
9335 Py_ssize_t n_zeros;
9336 int loop_broken = 0;
9337 int use_separator = 0; /* First time through, don't append the
9338 separator. They only go between
9339 groups. */
9340 Py_ssize_t buffer_pos;
9341 Py_ssize_t digits_pos;
9342 Py_ssize_t len;
9343 Py_ssize_t n_chars;
9344 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9345 be looked at */
9346 /* A generator that returns all of the grouping widths, until it
9347 returns 0. */
9348 GroupGenerator groupgen;
9349 GroupGenerator_init(&groupgen, grouping);
9350 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9351
9352 /* if digits are not grouped, thousands separator
9353 should be an empty string */
9354 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9355
9356 digits_pos = d_pos + n_digits;
9357 if (writer) {
9358 buffer_pos = writer->pos + n_buffer;
9359 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9360 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 }
Victor Stinner59423e32018-11-26 13:40:01 +01009362 else {
9363 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009364 }
Victor Stinner59423e32018-11-26 13:40:01 +01009365
9366 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009368 }
Victor Stinner59423e32018-11-26 13:40:01 +01009369
9370 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9371 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9372 n_zeros = Py_MAX(0, len - remaining);
9373 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9374
9375 /* Use n_zero zero's and n_chars chars */
9376
9377 /* Count only, don't do anything. */
9378 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9379
9380 /* Copy into the writer. */
9381 InsertThousandsGrouping_fill(writer, &buffer_pos,
9382 digits, &digits_pos,
9383 n_chars, n_zeros,
9384 use_separator ? thousands_sep : NULL,
9385 thousands_sep_len, maxchar);
9386
9387 /* Use a separator next time. */
9388 use_separator = 1;
9389
9390 remaining -= n_chars;
9391 min_width -= len;
9392
9393 if (remaining <= 0 && min_width <= 0) {
9394 loop_broken = 1;
9395 break;
9396 }
9397 min_width -= thousands_sep_len;
9398 }
9399 if (!loop_broken) {
9400 /* We left the loop without using a break statement. */
9401
9402 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9403 n_zeros = Py_MAX(0, len - remaining);
9404 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9405
9406 /* Use n_zero zero's and n_chars chars */
9407 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9408
9409 /* Copy into the writer. */
9410 InsertThousandsGrouping_fill(writer, &buffer_pos,
9411 digits, &digits_pos,
9412 n_chars, n_zeros,
9413 use_separator ? thousands_sep : NULL,
9414 thousands_sep_len, maxchar);
9415 }
9416 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417}
9418
9419
Alexander Belopolsky40018472011-02-26 01:02:56 +00009420Py_ssize_t
9421PyUnicode_Count(PyObject *str,
9422 PyObject *substr,
9423 Py_ssize_t start,
9424 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009426 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428 void *buf1 = NULL, *buf2 = NULL;
9429 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009430
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009432 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009433
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009434 kind1 = PyUnicode_KIND(str);
9435 kind2 = PyUnicode_KIND(substr);
9436 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009437 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009438
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009439 len1 = PyUnicode_GET_LENGTH(str);
9440 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009442 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009443 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009444
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009445 buf1 = PyUnicode_DATA(str);
9446 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009447 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009448 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009449 if (!buf2)
9450 goto onError;
9451 }
9452
9453 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009455 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009456 result = asciilib_count(
9457 ((Py_UCS1*)buf1) + start, end - start,
9458 buf2, len2, PY_SSIZE_T_MAX
9459 );
9460 else
9461 result = ucs1lib_count(
9462 ((Py_UCS1*)buf1) + start, end - start,
9463 buf2, len2, PY_SSIZE_T_MAX
9464 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 break;
9466 case PyUnicode_2BYTE_KIND:
9467 result = ucs2lib_count(
9468 ((Py_UCS2*)buf1) + start, end - start,
9469 buf2, len2, PY_SSIZE_T_MAX
9470 );
9471 break;
9472 case PyUnicode_4BYTE_KIND:
9473 result = ucs4lib_count(
9474 ((Py_UCS4*)buf1) + start, end - start,
9475 buf2, len2, PY_SSIZE_T_MAX
9476 );
9477 break;
9478 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009479 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009481
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009482 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 PyMem_Free(buf2);
9484
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009487 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 PyMem_Free(buf2);
9489 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490}
9491
Alexander Belopolsky40018472011-02-26 01:02:56 +00009492Py_ssize_t
9493PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009494 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009495 Py_ssize_t start,
9496 Py_ssize_t end,
9497 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009499 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009501
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009502 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503}
9504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505Py_ssize_t
9506PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9507 Py_ssize_t start, Py_ssize_t end,
9508 int direction)
9509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009511 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 if (PyUnicode_READY(str) == -1)
9513 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009514 len = PyUnicode_GET_LENGTH(str);
9515 ADJUST_INDICES(start, end, len);
9516 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009517 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009519 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9520 kind, end-start, ch, direction);
9521 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009523 else
9524 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525}
9526
Alexander Belopolsky40018472011-02-26 01:02:56 +00009527static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009528tailmatch(PyObject *self,
9529 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009530 Py_ssize_t start,
9531 Py_ssize_t end,
9532 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 int kind_self;
9535 int kind_sub;
9536 void *data_self;
9537 void *data_sub;
9538 Py_ssize_t offset;
9539 Py_ssize_t i;
9540 Py_ssize_t end_sub;
9541
9542 if (PyUnicode_READY(self) == -1 ||
9543 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009544 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9547 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009551 if (PyUnicode_GET_LENGTH(substring) == 0)
9552 return 1;
9553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 kind_self = PyUnicode_KIND(self);
9555 data_self = PyUnicode_DATA(self);
9556 kind_sub = PyUnicode_KIND(substring);
9557 data_sub = PyUnicode_DATA(substring);
9558 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9559
9560 if (direction > 0)
9561 offset = end;
9562 else
9563 offset = start;
9564
9565 if (PyUnicode_READ(kind_self, data_self, offset) ==
9566 PyUnicode_READ(kind_sub, data_sub, 0) &&
9567 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9568 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9569 /* If both are of the same kind, memcmp is sufficient */
9570 if (kind_self == kind_sub) {
9571 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009572 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 data_sub,
9574 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009575 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009577 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 else {
9579 /* We do not need to compare 0 and len(substring)-1 because
9580 the if statement above ensured already that they are equal
9581 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 for (i = 1; i < end_sub; ++i) {
9583 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9584 PyUnicode_READ(kind_sub, data_sub, i))
9585 return 0;
9586 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589 }
9590
9591 return 0;
9592}
9593
Alexander Belopolsky40018472011-02-26 01:02:56 +00009594Py_ssize_t
9595PyUnicode_Tailmatch(PyObject *str,
9596 PyObject *substr,
9597 Py_ssize_t start,
9598 Py_ssize_t end,
9599 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009601 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009602 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009603
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009604 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605}
9606
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009607static PyObject *
9608ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009610 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9611 char *resdata, *data = PyUnicode_DATA(self);
9612 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009613
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009614 res = PyUnicode_New(len, 127);
9615 if (res == NULL)
9616 return NULL;
9617 resdata = PyUnicode_DATA(res);
9618 if (lower)
9619 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009621 _Py_bytes_upper(resdata, data, len);
9622 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623}
9624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009626handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009628 Py_ssize_t j;
9629 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009630 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009631 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009632
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009633 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9634
9635 where ! is a negation and \p{xxx} is a character with property xxx.
9636 */
9637 for (j = i - 1; j >= 0; j--) {
9638 c = PyUnicode_READ(kind, data, j);
9639 if (!_PyUnicode_IsCaseIgnorable(c))
9640 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9643 if (final_sigma) {
9644 for (j = i + 1; j < length; j++) {
9645 c = PyUnicode_READ(kind, data, j);
9646 if (!_PyUnicode_IsCaseIgnorable(c))
9647 break;
9648 }
9649 final_sigma = j == length || !_PyUnicode_IsCased(c);
9650 }
9651 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652}
9653
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654static int
9655lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9656 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 /* Obscure special case. */
9659 if (c == 0x3A3) {
9660 mapped[0] = handle_capital_sigma(kind, data, length, i);
9661 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664}
9665
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666static Py_ssize_t
9667do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 Py_ssize_t i, k = 0;
9670 int n_res, j;
9671 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009672
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009674 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009676 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009677 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679 for (i = 1; i < length; i++) {
9680 c = PyUnicode_READ(kind, data, i);
9681 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9682 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009683 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009685 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009686 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009687 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688}
9689
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690static Py_ssize_t
9691do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9692 Py_ssize_t i, k = 0;
9693
9694 for (i = 0; i < length; i++) {
9695 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9696 int n_res, j;
9697 if (Py_UNICODE_ISUPPER(c)) {
9698 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9699 }
9700 else if (Py_UNICODE_ISLOWER(c)) {
9701 n_res = _PyUnicode_ToUpperFull(c, mapped);
9702 }
9703 else {
9704 n_res = 1;
9705 mapped[0] = c;
9706 }
9707 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009708 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 res[k++] = mapped[j];
9710 }
9711 }
9712 return k;
9713}
9714
9715static Py_ssize_t
9716do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9717 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719 Py_ssize_t i, k = 0;
9720
9721 for (i = 0; i < length; i++) {
9722 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9723 int n_res, j;
9724 if (lower)
9725 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9726 else
9727 n_res = _PyUnicode_ToUpperFull(c, mapped);
9728 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009729 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009730 res[k++] = mapped[j];
9731 }
9732 }
9733 return k;
9734}
9735
9736static Py_ssize_t
9737do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9738{
9739 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9740}
9741
9742static Py_ssize_t
9743do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9744{
9745 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9746}
9747
Benjamin Petersone51757f2012-01-12 21:10:29 -05009748static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009749do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9750{
9751 Py_ssize_t i, k = 0;
9752
9753 for (i = 0; i < length; i++) {
9754 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9755 Py_UCS4 mapped[3];
9756 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9757 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009758 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009759 res[k++] = mapped[j];
9760 }
9761 }
9762 return k;
9763}
9764
9765static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009766do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9767{
9768 Py_ssize_t i, k = 0;
9769 int previous_is_cased;
9770
9771 previous_is_cased = 0;
9772 for (i = 0; i < length; i++) {
9773 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9774 Py_UCS4 mapped[3];
9775 int n_res, j;
9776
9777 if (previous_is_cased)
9778 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9779 else
9780 n_res = _PyUnicode_ToTitleFull(c, mapped);
9781
9782 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009783 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009784 res[k++] = mapped[j];
9785 }
9786
9787 previous_is_cased = _PyUnicode_IsCased(c);
9788 }
9789 return k;
9790}
9791
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009792static PyObject *
9793case_operation(PyObject *self,
9794 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9795{
9796 PyObject *res = NULL;
9797 Py_ssize_t length, newlength = 0;
9798 int kind, outkind;
9799 void *data, *outdata;
9800 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9801
Benjamin Petersoneea48462012-01-16 14:28:50 -05009802 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009803
9804 kind = PyUnicode_KIND(self);
9805 data = PyUnicode_DATA(self);
9806 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009807 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009808 PyErr_SetString(PyExc_OverflowError, "string is too long");
9809 return NULL;
9810 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009811 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009812 if (tmp == NULL)
9813 return PyErr_NoMemory();
9814 newlength = perform(kind, data, length, tmp, &maxchar);
9815 res = PyUnicode_New(newlength, maxchar);
9816 if (res == NULL)
9817 goto leave;
9818 tmpend = tmp + newlength;
9819 outdata = PyUnicode_DATA(res);
9820 outkind = PyUnicode_KIND(res);
9821 switch (outkind) {
9822 case PyUnicode_1BYTE_KIND:
9823 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9824 break;
9825 case PyUnicode_2BYTE_KIND:
9826 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9827 break;
9828 case PyUnicode_4BYTE_KIND:
9829 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9830 break;
9831 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009832 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833 }
9834 leave:
9835 PyMem_FREE(tmp);
9836 return res;
9837}
9838
Tim Peters8ce9f162004-08-27 01:49:32 +00009839PyObject *
9840PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009842 PyObject *res;
9843 PyObject *fseq;
9844 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009845 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009847 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009848 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009849 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009850 }
9851
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009852 /* NOTE: the following code can't call back into Python code,
9853 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009854 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009855
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009856 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009857 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009858 res = _PyUnicode_JoinArray(separator, items, seqlen);
9859 Py_DECREF(fseq);
9860 return res;
9861}
9862
9863PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009864_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009865{
9866 PyObject *res = NULL; /* the result */
9867 PyObject *sep = NULL;
9868 Py_ssize_t seplen;
9869 PyObject *item;
9870 Py_ssize_t sz, i, res_offset;
9871 Py_UCS4 maxchar;
9872 Py_UCS4 item_maxchar;
9873 int use_memcpy;
9874 unsigned char *res_data = NULL, *sep_data = NULL;
9875 PyObject *last_obj;
9876 unsigned int kind = 0;
9877
Tim Peters05eba1f2004-08-27 21:32:02 +00009878 /* If empty sequence, return u"". */
9879 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009880 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009881 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009882
Tim Peters05eba1f2004-08-27 21:32:02 +00009883 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009884 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009885 if (seqlen == 1) {
9886 if (PyUnicode_CheckExact(items[0])) {
9887 res = items[0];
9888 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009889 return res;
9890 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009891 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009892 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009893 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009894 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009895 /* Set up sep and seplen */
9896 if (separator == NULL) {
9897 /* fall back to a blank space separator */
9898 sep = PyUnicode_FromOrdinal(' ');
9899 if (!sep)
9900 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009901 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009902 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009903 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009904 else {
9905 if (!PyUnicode_Check(separator)) {
9906 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009907 "separator: expected str instance,"
9908 " %.80s found",
9909 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009910 goto onError;
9911 }
9912 if (PyUnicode_READY(separator))
9913 goto onError;
9914 sep = separator;
9915 seplen = PyUnicode_GET_LENGTH(separator);
9916 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9917 /* inc refcount to keep this code path symmetric with the
9918 above case of a blank separator */
9919 Py_INCREF(sep);
9920 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009921 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009922 }
9923
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009924 /* There are at least two things to join, or else we have a subclass
9925 * of str in the sequence.
9926 * Do a pre-pass to figure out the total amount of space we'll
9927 * need (sz), and see whether all argument are strings.
9928 */
9929 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009930#ifdef Py_DEBUG
9931 use_memcpy = 0;
9932#else
9933 use_memcpy = 1;
9934#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009935 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009936 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009937 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009938 if (!PyUnicode_Check(item)) {
9939 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009940 "sequence item %zd: expected str instance,"
9941 " %.80s found",
9942 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 goto onError;
9944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (PyUnicode_READY(item) == -1)
9946 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009947 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009949 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009950 if (i != 0) {
9951 add_sz += seplen;
9952 }
9953 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009954 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009955 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009956 goto onError;
9957 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009958 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009959 if (use_memcpy && last_obj != NULL) {
9960 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9961 use_memcpy = 0;
9962 }
9963 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009964 }
Tim Petersced69f82003-09-16 20:30:58 +00009965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009967 if (res == NULL)
9968 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009969
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009970 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009971#ifdef Py_DEBUG
9972 use_memcpy = 0;
9973#else
9974 if (use_memcpy) {
9975 res_data = PyUnicode_1BYTE_DATA(res);
9976 kind = PyUnicode_KIND(res);
9977 if (seplen != 0)
9978 sep_data = PyUnicode_1BYTE_DATA(sep);
9979 }
9980#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009981 if (use_memcpy) {
9982 for (i = 0; i < seqlen; ++i) {
9983 Py_ssize_t itemlen;
9984 item = items[i];
9985
9986 /* Copy item, and maybe the separator. */
9987 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009988 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009989 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009990 kind * seplen);
9991 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009992 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009993
9994 itemlen = PyUnicode_GET_LENGTH(item);
9995 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009996 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009997 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009998 kind * itemlen);
9999 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010000 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010001 }
10002 assert(res_data == PyUnicode_1BYTE_DATA(res)
10003 + kind * PyUnicode_GET_LENGTH(res));
10004 }
10005 else {
10006 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10007 Py_ssize_t itemlen;
10008 item = items[i];
10009
10010 /* Copy item, and maybe the separator. */
10011 if (i && seplen != 0) {
10012 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10013 res_offset += seplen;
10014 }
10015
10016 itemlen = PyUnicode_GET_LENGTH(item);
10017 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010018 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010019 res_offset += itemlen;
10020 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010021 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010022 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010023 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010026 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028
Benjamin Peterson29060642009-01-31 22:14:21 +000010029 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010031 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 return NULL;
10033}
10034
Victor Stinnerd3f08822012-05-29 12:57:52 +020010035void
10036_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10037 Py_UCS4 fill_char)
10038{
10039 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010040 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010041 assert(PyUnicode_IS_READY(unicode));
10042 assert(unicode_modifiable(unicode));
10043 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10044 assert(start >= 0);
10045 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010046 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010047}
10048
Victor Stinner3fe55312012-01-04 00:33:50 +010010049Py_ssize_t
10050PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10051 Py_UCS4 fill_char)
10052{
10053 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010054
10055 if (!PyUnicode_Check(unicode)) {
10056 PyErr_BadInternalCall();
10057 return -1;
10058 }
10059 if (PyUnicode_READY(unicode) == -1)
10060 return -1;
10061 if (unicode_check_modifiable(unicode))
10062 return -1;
10063
Victor Stinnerd3f08822012-05-29 12:57:52 +020010064 if (start < 0) {
10065 PyErr_SetString(PyExc_IndexError, "string index out of range");
10066 return -1;
10067 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010068 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10069 PyErr_SetString(PyExc_ValueError,
10070 "fill character is bigger than "
10071 "the string maximum character");
10072 return -1;
10073 }
10074
10075 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10076 length = Py_MIN(maxlen, length);
10077 if (length <= 0)
10078 return 0;
10079
Victor Stinnerd3f08822012-05-29 12:57:52 +020010080 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010081 return length;
10082}
10083
Victor Stinner9310abb2011-10-05 00:59:23 +020010084static PyObject *
10085pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010086 Py_ssize_t left,
10087 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010089{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 PyObject *u;
10091 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010092 int kind;
10093 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010094
10095 if (left < 0)
10096 left = 0;
10097 if (right < 0)
10098 right = 0;
10099
Victor Stinnerc4b49542011-12-11 22:44:26 +010010100 if (left == 0 && right == 0)
10101 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10104 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010105 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10106 return NULL;
10107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010109 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010111 if (!u)
10112 return NULL;
10113
10114 kind = PyUnicode_KIND(u);
10115 data = PyUnicode_DATA(u);
10116 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010117 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010118 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010119 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010120 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010121 assert(_PyUnicode_CheckConsistency(u, 1));
10122 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123}
10124
Alexander Belopolsky40018472011-02-26 01:02:56 +000010125PyObject *
10126PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010130 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132
Benjamin Petersonead6b532011-12-20 17:23:42 -060010133 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010135 if (PyUnicode_IS_ASCII(string))
10136 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010137 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010138 PyUnicode_GET_LENGTH(string), keepends);
10139 else
10140 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010141 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 break;
10144 case PyUnicode_2BYTE_KIND:
10145 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010146 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 PyUnicode_GET_LENGTH(string), keepends);
10148 break;
10149 case PyUnicode_4BYTE_KIND:
10150 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010151 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 PyUnicode_GET_LENGTH(string), keepends);
10153 break;
10154 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010155 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158}
10159
Alexander Belopolsky40018472011-02-26 01:02:56 +000010160static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010161split(PyObject *self,
10162 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010163 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010165 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 void *buf1, *buf2;
10167 Py_ssize_t len1, len2;
10168 PyObject* out;
10169
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010171 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 if (PyUnicode_READY(self) == -1)
10174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010177 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010179 if (PyUnicode_IS_ASCII(self))
10180 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010181 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 PyUnicode_GET_LENGTH(self), maxcount
10183 );
10184 else
10185 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010186 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010187 PyUnicode_GET_LENGTH(self), maxcount
10188 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 case PyUnicode_2BYTE_KIND:
10190 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010191 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 PyUnicode_GET_LENGTH(self), maxcount
10193 );
10194 case PyUnicode_4BYTE_KIND:
10195 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 PyUnicode_GET_LENGTH(self), maxcount
10198 );
10199 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010200 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 }
10202
10203 if (PyUnicode_READY(substring) == -1)
10204 return NULL;
10205
10206 kind1 = PyUnicode_KIND(self);
10207 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 len1 = PyUnicode_GET_LENGTH(self);
10209 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010210 if (kind1 < kind2 || len1 < len2) {
10211 out = PyList_New(1);
10212 if (out == NULL)
10213 return NULL;
10214 Py_INCREF(self);
10215 PyList_SET_ITEM(out, 0, self);
10216 return out;
10217 }
10218 buf1 = PyUnicode_DATA(self);
10219 buf2 = PyUnicode_DATA(substring);
10220 if (kind2 != kind1) {
10221 buf2 = _PyUnicode_AsKind(substring, kind1);
10222 if (!buf2)
10223 return NULL;
10224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010226 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10229 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010230 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010231 else
10232 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010233 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 break;
10235 case PyUnicode_2BYTE_KIND:
10236 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010237 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 break;
10239 case PyUnicode_4BYTE_KIND:
10240 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010241 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 break;
10243 default:
10244 out = NULL;
10245 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010246 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 PyMem_Free(buf2);
10248 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249}
10250
Alexander Belopolsky40018472011-02-26 01:02:56 +000010251static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010252rsplit(PyObject *self,
10253 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010254 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010255{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010256 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 void *buf1, *buf2;
10258 Py_ssize_t len1, len2;
10259 PyObject* out;
10260
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010261 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010262 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 if (PyUnicode_READY(self) == -1)
10265 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010268 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010270 if (PyUnicode_IS_ASCII(self))
10271 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 PyUnicode_GET_LENGTH(self), maxcount
10274 );
10275 else
10276 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010277 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 PyUnicode_GET_LENGTH(self), maxcount
10279 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 case PyUnicode_2BYTE_KIND:
10281 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010282 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 PyUnicode_GET_LENGTH(self), maxcount
10284 );
10285 case PyUnicode_4BYTE_KIND:
10286 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010287 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 PyUnicode_GET_LENGTH(self), maxcount
10289 );
10290 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010291 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 }
10293
10294 if (PyUnicode_READY(substring) == -1)
10295 return NULL;
10296
10297 kind1 = PyUnicode_KIND(self);
10298 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 len1 = PyUnicode_GET_LENGTH(self);
10300 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010301 if (kind1 < kind2 || len1 < len2) {
10302 out = PyList_New(1);
10303 if (out == NULL)
10304 return NULL;
10305 Py_INCREF(self);
10306 PyList_SET_ITEM(out, 0, self);
10307 return out;
10308 }
10309 buf1 = PyUnicode_DATA(self);
10310 buf2 = PyUnicode_DATA(substring);
10311 if (kind2 != kind1) {
10312 buf2 = _PyUnicode_AsKind(substring, kind1);
10313 if (!buf2)
10314 return NULL;
10315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010317 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010319 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10320 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010321 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010322 else
10323 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010324 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 break;
10326 case PyUnicode_2BYTE_KIND:
10327 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010328 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 break;
10330 case PyUnicode_4BYTE_KIND:
10331 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 break;
10334 default:
10335 out = NULL;
10336 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010337 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 PyMem_Free(buf2);
10339 return out;
10340}
10341
10342static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010343anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10344 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010346 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010348 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10349 return asciilib_find(buf1, len1, buf2, len2, offset);
10350 else
10351 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 case PyUnicode_2BYTE_KIND:
10353 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10354 case PyUnicode_4BYTE_KIND:
10355 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10356 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010357 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358}
10359
10360static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10362 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010364 switch (kind) {
10365 case PyUnicode_1BYTE_KIND:
10366 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10367 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10368 else
10369 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10370 case PyUnicode_2BYTE_KIND:
10371 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10372 case PyUnicode_4BYTE_KIND:
10373 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10374 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010375 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010376}
10377
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010378static void
10379replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10380 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10381{
10382 int kind = PyUnicode_KIND(u);
10383 void *data = PyUnicode_DATA(u);
10384 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10385 if (kind == PyUnicode_1BYTE_KIND) {
10386 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10387 (Py_UCS1 *)data + len,
10388 u1, u2, maxcount);
10389 }
10390 else if (kind == PyUnicode_2BYTE_KIND) {
10391 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10392 (Py_UCS2 *)data + len,
10393 u1, u2, maxcount);
10394 }
10395 else {
10396 assert(kind == PyUnicode_4BYTE_KIND);
10397 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10398 (Py_UCS4 *)data + len,
10399 u1, u2, maxcount);
10400 }
10401}
10402
Alexander Belopolsky40018472011-02-26 01:02:56 +000010403static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404replace(PyObject *self, PyObject *str1,
10405 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 PyObject *u;
10408 char *sbuf = PyUnicode_DATA(self);
10409 char *buf1 = PyUnicode_DATA(str1);
10410 char *buf2 = PyUnicode_DATA(str2);
10411 int srelease = 0, release1 = 0, release2 = 0;
10412 int skind = PyUnicode_KIND(self);
10413 int kind1 = PyUnicode_KIND(str1);
10414 int kind2 = PyUnicode_KIND(str2);
10415 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10416 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10417 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010418 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010419 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420
10421 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010422 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010424 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425
Victor Stinner59de0ee2011-10-07 10:01:28 +020010426 if (str1 == str2)
10427 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428
Victor Stinner49a0a212011-10-12 23:46:10 +020010429 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010430 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10431 if (maxchar < maxchar_str1)
10432 /* substring too wide to be present */
10433 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010434 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10435 /* Replacing str1 with str2 may cause a maxchar reduction in the
10436 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010437 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010438 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010441 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010443 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010445 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010446 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010447 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010448
Victor Stinner69ed0f42013-04-09 21:48:24 +020010449 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010450 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010451 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010452 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010453 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010457
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010458 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10459 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010460 }
10461 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 int rkind = skind;
10463 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010464 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (kind1 < rkind) {
10467 /* widen substring */
10468 buf1 = _PyUnicode_AsKind(str1, rkind);
10469 if (!buf1) goto error;
10470 release1 = 1;
10471 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010472 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473 if (i < 0)
10474 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (rkind > kind2) {
10476 /* widen replacement */
10477 buf2 = _PyUnicode_AsKind(str2, rkind);
10478 if (!buf2) goto error;
10479 release2 = 1;
10480 }
10481 else if (rkind < kind2) {
10482 /* widen self and buf1 */
10483 rkind = kind2;
10484 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010485 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 sbuf = _PyUnicode_AsKind(self, rkind);
10487 if (!sbuf) goto error;
10488 srelease = 1;
10489 buf1 = _PyUnicode_AsKind(str1, rkind);
10490 if (!buf1) goto error;
10491 release1 = 1;
10492 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010493 u = PyUnicode_New(slen, maxchar);
10494 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010496 assert(PyUnicode_KIND(u) == rkind);
10497 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010498
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010499 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010500 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010501 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010503 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010505
10506 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010507 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010508 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010509 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010510 if (i == -1)
10511 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010512 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010514 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010518 }
10519 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010521 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 int rkind = skind;
10523 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010526 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 buf1 = _PyUnicode_AsKind(str1, rkind);
10528 if (!buf1) goto error;
10529 release1 = 1;
10530 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010531 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010532 if (n == 0)
10533 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010535 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 buf2 = _PyUnicode_AsKind(str2, rkind);
10537 if (!buf2) goto error;
10538 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010541 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 rkind = kind2;
10543 sbuf = _PyUnicode_AsKind(self, rkind);
10544 if (!sbuf) goto error;
10545 srelease = 1;
10546 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010547 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 buf1 = _PyUnicode_AsKind(str1, rkind);
10549 if (!buf1) goto error;
10550 release1 = 1;
10551 }
10552 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10553 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010554 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 PyErr_SetString(PyExc_OverflowError,
10556 "replace string is too long");
10557 goto error;
10558 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010559 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010560 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010561 _Py_INCREF_UNICODE_EMPTY();
10562 if (!unicode_empty)
10563 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010564 u = unicode_empty;
10565 goto done;
10566 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010567 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 PyErr_SetString(PyExc_OverflowError,
10569 "replace string is too long");
10570 goto error;
10571 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010572 u = PyUnicode_New(new_size, maxchar);
10573 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010575 assert(PyUnicode_KIND(u) == rkind);
10576 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 ires = i = 0;
10578 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010579 while (n-- > 0) {
10580 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010581 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010582 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010583 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010584 if (j == -1)
10585 break;
10586 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010587 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010588 memcpy(res + rkind * ires,
10589 sbuf + rkind * i,
10590 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010592 }
10593 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010595 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010597 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010603 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010604 memcpy(res + rkind * ires,
10605 sbuf + rkind * i,
10606 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010607 }
10608 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 /* interleave */
10610 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010611 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010613 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010615 if (--n <= 0)
10616 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010617 memcpy(res + rkind * ires,
10618 sbuf + rkind * i,
10619 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 ires++;
10621 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010622 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010623 memcpy(res + rkind * ires,
10624 sbuf + rkind * i,
10625 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010626 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010627 }
10628
10629 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010630 unicode_adjust_maxchar(&u);
10631 if (u == NULL)
10632 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010634
10635 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 if (srelease)
10637 PyMem_FREE(sbuf);
10638 if (release1)
10639 PyMem_FREE(buf1);
10640 if (release2)
10641 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010642 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644
Benjamin Peterson29060642009-01-31 22:14:21 +000010645 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 if (srelease)
10648 PyMem_FREE(sbuf);
10649 if (release1)
10650 PyMem_FREE(buf1);
10651 if (release2)
10652 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010653 return unicode_result_unchanged(self);
10654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 error:
10656 if (srelease && sbuf)
10657 PyMem_FREE(sbuf);
10658 if (release1 && buf1)
10659 PyMem_FREE(buf1);
10660 if (release2 && buf2)
10661 PyMem_FREE(buf2);
10662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663}
10664
10665/* --- Unicode Object Methods --------------------------------------------- */
10666
INADA Naoki3ae20562017-01-16 20:41:20 +090010667/*[clinic input]
10668str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669
INADA Naoki3ae20562017-01-16 20:41:20 +090010670Return a version of the string where each word is titlecased.
10671
10672More specifically, words start with uppercased characters and all remaining
10673cased characters have lower case.
10674[clinic start generated code]*/
10675
10676static PyObject *
10677unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010678/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010680 if (PyUnicode_READY(self) == -1)
10681 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010682 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683}
10684
INADA Naoki3ae20562017-01-16 20:41:20 +090010685/*[clinic input]
10686str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687
INADA Naoki3ae20562017-01-16 20:41:20 +090010688Return a capitalized version of the string.
10689
10690More specifically, make the first character have upper case and the rest lower
10691case.
10692[clinic start generated code]*/
10693
10694static PyObject *
10695unicode_capitalize_impl(PyObject *self)
10696/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010698 if (PyUnicode_READY(self) == -1)
10699 return NULL;
10700 if (PyUnicode_GET_LENGTH(self) == 0)
10701 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010702 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703}
10704
INADA Naoki3ae20562017-01-16 20:41:20 +090010705/*[clinic input]
10706str.casefold as unicode_casefold
10707
10708Return a version of the string suitable for caseless comparisons.
10709[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010710
10711static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010712unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010713/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010714{
10715 if (PyUnicode_READY(self) == -1)
10716 return NULL;
10717 if (PyUnicode_IS_ASCII(self))
10718 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010719 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010720}
10721
10722
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010723/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010724
10725static int
10726convert_uc(PyObject *obj, void *addr)
10727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010729
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010730 if (!PyUnicode_Check(obj)) {
10731 PyErr_Format(PyExc_TypeError,
10732 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010733 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010734 return 0;
10735 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010736 if (PyUnicode_READY(obj) < 0)
10737 return 0;
10738 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010739 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010740 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010741 return 0;
10742 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010743 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010744 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010745}
10746
INADA Naoki3ae20562017-01-16 20:41:20 +090010747/*[clinic input]
10748str.center as unicode_center
10749
10750 width: Py_ssize_t
10751 fillchar: Py_UCS4 = ' '
10752 /
10753
10754Return a centered string of length width.
10755
10756Padding is done using the specified fill character (default is a space).
10757[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758
10759static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010760unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10761/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010763 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764
Benjamin Petersonbac79492012-01-14 13:34:47 -050010765 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 return NULL;
10767
Victor Stinnerc4b49542011-12-11 22:44:26 +010010768 if (PyUnicode_GET_LENGTH(self) >= width)
10769 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770
Victor Stinnerc4b49542011-12-11 22:44:26 +010010771 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 left = marg / 2 + (marg & width & 1);
10773
Victor Stinner9310abb2011-10-05 00:59:23 +020010774 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775}
10776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777/* This function assumes that str1 and str2 are readied by the caller. */
10778
Marc-André Lemburge5034372000-08-08 08:04:29 +000010779static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010780unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010781{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010782#define COMPARE(TYPE1, TYPE2) \
10783 do { \
10784 TYPE1* p1 = (TYPE1 *)data1; \
10785 TYPE2* p2 = (TYPE2 *)data2; \
10786 TYPE1* end = p1 + len; \
10787 Py_UCS4 c1, c2; \
10788 for (; p1 != end; p1++, p2++) { \
10789 c1 = *p1; \
10790 c2 = *p2; \
10791 if (c1 != c2) \
10792 return (c1 < c2) ? -1 : 1; \
10793 } \
10794 } \
10795 while (0)
10796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 int kind1, kind2;
10798 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010799 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 kind1 = PyUnicode_KIND(str1);
10802 kind2 = PyUnicode_KIND(str2);
10803 data1 = PyUnicode_DATA(str1);
10804 data2 = PyUnicode_DATA(str2);
10805 len1 = PyUnicode_GET_LENGTH(str1);
10806 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010807 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010808
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010809 switch(kind1) {
10810 case PyUnicode_1BYTE_KIND:
10811 {
10812 switch(kind2) {
10813 case PyUnicode_1BYTE_KIND:
10814 {
10815 int cmp = memcmp(data1, data2, len);
10816 /* normalize result of memcmp() into the range [-1; 1] */
10817 if (cmp < 0)
10818 return -1;
10819 if (cmp > 0)
10820 return 1;
10821 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010822 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010823 case PyUnicode_2BYTE_KIND:
10824 COMPARE(Py_UCS1, Py_UCS2);
10825 break;
10826 case PyUnicode_4BYTE_KIND:
10827 COMPARE(Py_UCS1, Py_UCS4);
10828 break;
10829 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010830 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010831 }
10832 break;
10833 }
10834 case PyUnicode_2BYTE_KIND:
10835 {
10836 switch(kind2) {
10837 case PyUnicode_1BYTE_KIND:
10838 COMPARE(Py_UCS2, Py_UCS1);
10839 break;
10840 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010841 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010842 COMPARE(Py_UCS2, Py_UCS2);
10843 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010844 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010845 case PyUnicode_4BYTE_KIND:
10846 COMPARE(Py_UCS2, Py_UCS4);
10847 break;
10848 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010849 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010850 }
10851 break;
10852 }
10853 case PyUnicode_4BYTE_KIND:
10854 {
10855 switch(kind2) {
10856 case PyUnicode_1BYTE_KIND:
10857 COMPARE(Py_UCS4, Py_UCS1);
10858 break;
10859 case PyUnicode_2BYTE_KIND:
10860 COMPARE(Py_UCS4, Py_UCS2);
10861 break;
10862 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010863 {
10864#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10865 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10866 /* normalize result of wmemcmp() into the range [-1; 1] */
10867 if (cmp < 0)
10868 return -1;
10869 if (cmp > 0)
10870 return 1;
10871#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010872 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010873#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010874 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010875 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010876 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010877 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010878 }
10879 break;
10880 }
10881 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010882 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010883 }
10884
Victor Stinner770e19e2012-10-04 22:59:45 +020010885 if (len1 == len2)
10886 return 0;
10887 if (len1 < len2)
10888 return -1;
10889 else
10890 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010891
10892#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010893}
10894
Benjamin Peterson621b4302016-09-09 13:54:34 -070010895static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010896unicode_compare_eq(PyObject *str1, PyObject *str2)
10897{
10898 int kind;
10899 void *data1, *data2;
10900 Py_ssize_t len;
10901 int cmp;
10902
Victor Stinnere5567ad2012-10-23 02:48:49 +020010903 len = PyUnicode_GET_LENGTH(str1);
10904 if (PyUnicode_GET_LENGTH(str2) != len)
10905 return 0;
10906 kind = PyUnicode_KIND(str1);
10907 if (PyUnicode_KIND(str2) != kind)
10908 return 0;
10909 data1 = PyUnicode_DATA(str1);
10910 data2 = PyUnicode_DATA(str2);
10911
10912 cmp = memcmp(data1, data2, len * kind);
10913 return (cmp == 0);
10914}
10915
10916
Alexander Belopolsky40018472011-02-26 01:02:56 +000010917int
10918PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10921 if (PyUnicode_READY(left) == -1 ||
10922 PyUnicode_READY(right) == -1)
10923 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010924
10925 /* a string is equal to itself */
10926 if (left == right)
10927 return 0;
10928
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010929 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010931 PyErr_Format(PyExc_TypeError,
10932 "Can't compare %.100s and %.100s",
10933 left->ob_type->tp_name,
10934 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935 return -1;
10936}
10937
Martin v. Löwis5b222132007-06-10 09:51:05 +000010938int
10939PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 Py_ssize_t i;
10942 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010944 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945
Victor Stinner910337b2011-10-03 03:20:16 +020010946 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010947 if (!PyUnicode_IS_READY(uni)) {
10948 const wchar_t *ws = _PyUnicode_WSTR(uni);
10949 /* Compare Unicode string and source character set string */
10950 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10951 if (chr != ustr[i])
10952 return (chr < ustr[i]) ? -1 : 1;
10953 }
10954 /* This check keeps Python strings that end in '\0' from comparing equal
10955 to C strings identical up to that point. */
10956 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10957 return 1; /* uni is longer */
10958 if (ustr[i])
10959 return -1; /* str is longer */
10960 return 0;
10961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010963 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010964 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010965 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010966 size_t len, len2 = strlen(str);
10967 int cmp;
10968
10969 len = Py_MIN(len1, len2);
10970 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010971 if (cmp != 0) {
10972 if (cmp < 0)
10973 return -1;
10974 else
10975 return 1;
10976 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010977 if (len1 > len2)
10978 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010979 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010980 return -1; /* str is longer */
10981 return 0;
10982 }
10983 else {
10984 void *data = PyUnicode_DATA(uni);
10985 /* Compare Unicode string and source character set string */
10986 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010987 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010988 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10989 /* This check keeps Python strings that end in '\0' from comparing equal
10990 to C strings identical up to that point. */
10991 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10992 return 1; /* uni is longer */
10993 if (str[i])
10994 return -1; /* str is longer */
10995 return 0;
10996 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010997}
10998
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020010999static int
11000non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11001{
11002 size_t i, len;
11003 const wchar_t *p;
11004 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11005 if (strlen(str) != len)
11006 return 0;
11007 p = _PyUnicode_WSTR(unicode);
11008 assert(p);
11009 for (i = 0; i < len; i++) {
11010 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011011 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011012 return 0;
11013 }
11014 return 1;
11015}
11016
11017int
11018_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11019{
11020 size_t len;
11021 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011022 assert(str);
11023#ifndef NDEBUG
11024 for (const char *p = str; *p; p++) {
11025 assert((unsigned char)*p < 128);
11026 }
11027#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011028 if (PyUnicode_READY(unicode) == -1) {
11029 /* Memory error or bad data */
11030 PyErr_Clear();
11031 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11032 }
11033 if (!PyUnicode_IS_ASCII(unicode))
11034 return 0;
11035 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11036 return strlen(str) == len &&
11037 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11038}
11039
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011040int
11041_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11042{
11043 PyObject *right_uni;
11044 Py_hash_t hash;
11045
11046 assert(_PyUnicode_CHECK(left));
11047 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011048#ifndef NDEBUG
11049 for (const char *p = right->string; *p; p++) {
11050 assert((unsigned char)*p < 128);
11051 }
11052#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011053
11054 if (PyUnicode_READY(left) == -1) {
11055 /* memory error or bad data */
11056 PyErr_Clear();
11057 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11058 }
11059
11060 if (!PyUnicode_IS_ASCII(left))
11061 return 0;
11062
11063 right_uni = _PyUnicode_FromId(right); /* borrowed */
11064 if (right_uni == NULL) {
11065 /* memory error or bad data */
11066 PyErr_Clear();
11067 return _PyUnicode_EqualToASCIIString(left, right->string);
11068 }
11069
11070 if (left == right_uni)
11071 return 1;
11072
11073 if (PyUnicode_CHECK_INTERNED(left))
11074 return 0;
11075
INADA Naoki7cc95f52018-01-28 02:07:09 +090011076 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011077 hash = _PyUnicode_HASH(left);
11078 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11079 return 0;
11080
11081 return unicode_compare_eq(left, right_uni);
11082}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011083
Alexander Belopolsky40018472011-02-26 01:02:56 +000011084PyObject *
11085PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011086{
11087 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011088
Victor Stinnere5567ad2012-10-23 02:48:49 +020011089 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11090 Py_RETURN_NOTIMPLEMENTED;
11091
11092 if (PyUnicode_READY(left) == -1 ||
11093 PyUnicode_READY(right) == -1)
11094 return NULL;
11095
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011096 if (left == right) {
11097 switch (op) {
11098 case Py_EQ:
11099 case Py_LE:
11100 case Py_GE:
11101 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011102 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011103 case Py_NE:
11104 case Py_LT:
11105 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011106 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011107 default:
11108 PyErr_BadArgument();
11109 return NULL;
11110 }
11111 }
11112 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011113 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011114 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011115 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011116 }
11117 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011118 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011119 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011120 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011121}
11122
Alexander Belopolsky40018472011-02-26 01:02:56 +000011123int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011124_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11125{
11126 return unicode_eq(aa, bb);
11127}
11128
11129int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011130PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011131{
Victor Stinner77282cb2013-04-14 19:22:47 +020011132 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 void *buf1, *buf2;
11134 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011135 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011136
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011137 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011138 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011139 "'in <string>' requires string as left operand, not %.100s",
11140 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011141 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011142 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011143 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011144 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011145 if (ensure_unicode(str) < 0)
11146 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011149 kind2 = PyUnicode_KIND(substr);
11150 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011151 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011153 len2 = PyUnicode_GET_LENGTH(substr);
11154 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011155 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011156 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011157 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011158 if (len2 == 1) {
11159 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11160 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011161 return result;
11162 }
11163 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011164 buf2 = _PyUnicode_AsKind(substr, kind1);
11165 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011166 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011167 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168
Victor Stinner77282cb2013-04-14 19:22:47 +020011169 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 case PyUnicode_1BYTE_KIND:
11171 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11172 break;
11173 case PyUnicode_2BYTE_KIND:
11174 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11175 break;
11176 case PyUnicode_4BYTE_KIND:
11177 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11178 break;
11179 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011180 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182
Victor Stinner77282cb2013-04-14 19:22:47 +020011183 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 PyMem_Free(buf2);
11185
Guido van Rossum403d68b2000-03-13 15:55:09 +000011186 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011187}
11188
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189/* Concat to string or Unicode object giving a new Unicode object. */
11190
Alexander Belopolsky40018472011-02-26 01:02:56 +000011191PyObject *
11192PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011194 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011195 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011196 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011198 if (ensure_unicode(left) < 0)
11199 return NULL;
11200
11201 if (!PyUnicode_Check(right)) {
11202 PyErr_Format(PyExc_TypeError,
11203 "can only concatenate str (not \"%.200s\") to str",
11204 right->ob_type->tp_name);
11205 return NULL;
11206 }
11207 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209
11210 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011211 if (left == unicode_empty)
11212 return PyUnicode_FromObject(right);
11213 if (right == unicode_empty)
11214 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011216 left_len = PyUnicode_GET_LENGTH(left);
11217 right_len = PyUnicode_GET_LENGTH(right);
11218 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011219 PyErr_SetString(PyExc_OverflowError,
11220 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011222 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011223 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011224
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11226 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011227 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011230 result = PyUnicode_New(new_len, maxchar);
11231 if (result == NULL)
11232 return NULL;
11233 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11234 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11235 assert(_PyUnicode_CheckConsistency(result, 1));
11236 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237}
11238
Walter Dörwald1ab83302007-05-18 17:15:44 +000011239void
Victor Stinner23e56682011-10-03 03:54:37 +020011240PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011241{
Victor Stinner23e56682011-10-03 03:54:37 +020011242 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011243 Py_UCS4 maxchar, maxchar2;
11244 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011245
11246 if (p_left == NULL) {
11247 if (!PyErr_Occurred())
11248 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011249 return;
11250 }
Victor Stinner23e56682011-10-03 03:54:37 +020011251 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011252 if (right == NULL || left == NULL
11253 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011254 if (!PyErr_Occurred())
11255 PyErr_BadInternalCall();
11256 goto error;
11257 }
11258
Benjamin Petersonbac79492012-01-14 13:34:47 -050011259 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011260 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011261 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011262 goto error;
11263
Victor Stinner488fa492011-12-12 00:01:39 +010011264 /* Shortcuts */
11265 if (left == unicode_empty) {
11266 Py_DECREF(left);
11267 Py_INCREF(right);
11268 *p_left = right;
11269 return;
11270 }
11271 if (right == unicode_empty)
11272 return;
11273
11274 left_len = PyUnicode_GET_LENGTH(left);
11275 right_len = PyUnicode_GET_LENGTH(right);
11276 if (left_len > PY_SSIZE_T_MAX - right_len) {
11277 PyErr_SetString(PyExc_OverflowError,
11278 "strings are too large to concat");
11279 goto error;
11280 }
11281 new_len = left_len + right_len;
11282
11283 if (unicode_modifiable(left)
11284 && PyUnicode_CheckExact(right)
11285 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011286 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11287 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011288 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011289 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011290 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11291 {
11292 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011293 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011294 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011295
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011296 /* copy 'right' into the newly allocated area of 'left' */
11297 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011298 }
Victor Stinner488fa492011-12-12 00:01:39 +010011299 else {
11300 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11301 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011302 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011303
Victor Stinner488fa492011-12-12 00:01:39 +010011304 /* Concat the two Unicode strings */
11305 res = PyUnicode_New(new_len, maxchar);
11306 if (res == NULL)
11307 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011308 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11309 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011310 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011311 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011312 }
11313 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011314 return;
11315
11316error:
Victor Stinner488fa492011-12-12 00:01:39 +010011317 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011318}
11319
11320void
11321PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011323 PyUnicode_Append(pleft, right);
11324 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011325}
11326
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011327/*
11328Wraps stringlib_parse_args_finds() and additionally ensures that the
11329first argument is a unicode object.
11330*/
11331
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011332static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011333parse_args_finds_unicode(const char * function_name, PyObject *args,
11334 PyObject **substring,
11335 Py_ssize_t *start, Py_ssize_t *end)
11336{
11337 if(stringlib_parse_args_finds(function_name, args, substring,
11338 start, end)) {
11339 if (ensure_unicode(*substring) < 0)
11340 return 0;
11341 return 1;
11342 }
11343 return 0;
11344}
11345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011346PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011349Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011350string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011351interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352
11353static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011354unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011356 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011357 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011358 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011360 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 void *buf1, *buf2;
11362 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011364 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 kind1 = PyUnicode_KIND(self);
11368 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011369 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011370 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 len1 = PyUnicode_GET_LENGTH(self);
11373 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011375 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011376 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011377
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011378 buf1 = PyUnicode_DATA(self);
11379 buf2 = PyUnicode_DATA(substring);
11380 if (kind2 != kind1) {
11381 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011382 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011383 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011384 }
11385 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 case PyUnicode_1BYTE_KIND:
11387 iresult = ucs1lib_count(
11388 ((Py_UCS1*)buf1) + start, end - start,
11389 buf2, len2, PY_SSIZE_T_MAX
11390 );
11391 break;
11392 case PyUnicode_2BYTE_KIND:
11393 iresult = ucs2lib_count(
11394 ((Py_UCS2*)buf1) + start, end - start,
11395 buf2, len2, PY_SSIZE_T_MAX
11396 );
11397 break;
11398 case PyUnicode_4BYTE_KIND:
11399 iresult = ucs4lib_count(
11400 ((Py_UCS4*)buf1) + start, end - start,
11401 buf2, len2, PY_SSIZE_T_MAX
11402 );
11403 break;
11404 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011405 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 }
11407
11408 result = PyLong_FromSsize_t(iresult);
11409
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011410 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413 return result;
11414}
11415
INADA Naoki3ae20562017-01-16 20:41:20 +090011416/*[clinic input]
11417str.encode as unicode_encode
11418
11419 encoding: str(c_default="NULL") = 'utf-8'
11420 The encoding in which to encode the string.
11421 errors: str(c_default="NULL") = 'strict'
11422 The error handling scheme to use for encoding errors.
11423 The default is 'strict' meaning that encoding errors raise a
11424 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11425 'xmlcharrefreplace' as well as any other name registered with
11426 codecs.register_error that can handle UnicodeEncodeErrors.
11427
11428Encode the string using the codec registered for encoding.
11429[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011432unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011433/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011435 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011436}
11437
INADA Naoki3ae20562017-01-16 20:41:20 +090011438/*[clinic input]
11439str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
INADA Naoki3ae20562017-01-16 20:41:20 +090011441 tabsize: int = 8
11442
11443Return a copy where all tab characters are expanded using spaces.
11444
11445If tabsize is not given, a tab size of 8 characters is assumed.
11446[clinic start generated code]*/
11447
11448static PyObject *
11449unicode_expandtabs_impl(PyObject *self, int tabsize)
11450/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011452 Py_ssize_t i, j, line_pos, src_len, incr;
11453 Py_UCS4 ch;
11454 PyObject *u;
11455 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011456 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011457 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458
Antoine Pitrou22425222011-10-04 19:10:51 +020011459 if (PyUnicode_READY(self) == -1)
11460 return NULL;
11461
Thomas Wouters7e474022000-07-16 12:04:32 +000011462 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011463 src_len = PyUnicode_GET_LENGTH(self);
11464 i = j = line_pos = 0;
11465 kind = PyUnicode_KIND(self);
11466 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011467 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011468 for (; i < src_len; i++) {
11469 ch = PyUnicode_READ(kind, src_data, i);
11470 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011471 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011473 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011475 goto overflow;
11476 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011478 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011482 goto overflow;
11483 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011485 if (ch == '\n' || ch == '\r')
11486 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011488 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011489 if (!found)
11490 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011491
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011493 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 if (!u)
11495 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011496 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
Antoine Pitroue71d5742011-10-04 15:55:09 +020011498 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499
Antoine Pitroue71d5742011-10-04 15:55:09 +020011500 for (; i < src_len; i++) {
11501 ch = PyUnicode_READ(kind, src_data, i);
11502 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011504 incr = tabsize - (line_pos % tabsize);
11505 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011506 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011507 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011509 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011511 line_pos++;
11512 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011513 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011514 if (ch == '\n' || ch == '\r')
11515 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011517 }
11518 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011519 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011520
Antoine Pitroue71d5742011-10-04 15:55:09 +020011521 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011522 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11523 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524}
11525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011526PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528\n\
11529Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011530such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531arguments start and end are interpreted as in slice notation.\n\
11532\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011533Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
11535static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011538 /* initialize variables to prevent gcc warning */
11539 PyObject *substring = NULL;
11540 Py_ssize_t start = 0;
11541 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011542 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011544 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011547 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011550 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 if (result == -2)
11553 return NULL;
11554
Christian Heimes217cfd12007-12-02 14:31:20 +000011555 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556}
11557
11558static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011559unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011561 void *data;
11562 enum PyUnicode_Kind kind;
11563 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011564
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011565 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011566 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011568 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011569 if (PyUnicode_READY(self) == -1) {
11570 return NULL;
11571 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011572 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11573 PyErr_SetString(PyExc_IndexError, "string index out of range");
11574 return NULL;
11575 }
11576 kind = PyUnicode_KIND(self);
11577 data = PyUnicode_DATA(self);
11578 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011579 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580}
11581
Guido van Rossumc2504932007-09-18 19:42:40 +000011582/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011583 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011584static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011585unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011587 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011588
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011589#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011590 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011591#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 if (_PyUnicode_HASH(self) != -1)
11593 return _PyUnicode_HASH(self);
11594 if (PyUnicode_READY(self) == -1)
11595 return -1;
animalizea1d14252019-01-02 20:16:06 +080011596
Christian Heimes985ecdc2013-11-20 11:46:18 +010011597 x = _Py_HashBytes(PyUnicode_DATA(self),
11598 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011600 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601}
11602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011603PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605\n\
oldkaa0735f2018-02-02 16:52:55 +080011606Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011607such that sub is contained within S[start:end]. Optional\n\
11608arguments start and end are interpreted as in slice notation.\n\
11609\n\
11610Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
11612static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011615 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011616 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011617 PyObject *substring = NULL;
11618 Py_ssize_t start = 0;
11619 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011621 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011624 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011627 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629 if (result == -2)
11630 return NULL;
11631
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 if (result < 0) {
11633 PyErr_SetString(PyExc_ValueError, "substring not found");
11634 return NULL;
11635 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011636
Christian Heimes217cfd12007-12-02 14:31:20 +000011637 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638}
11639
INADA Naoki3ae20562017-01-16 20:41:20 +090011640/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011641str.isascii as unicode_isascii
11642
11643Return True if all characters in the string are ASCII, False otherwise.
11644
11645ASCII characters have code points in the range U+0000-U+007F.
11646Empty string is ASCII too.
11647[clinic start generated code]*/
11648
11649static PyObject *
11650unicode_isascii_impl(PyObject *self)
11651/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11652{
11653 if (PyUnicode_READY(self) == -1) {
11654 return NULL;
11655 }
11656 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11657}
11658
11659/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011660str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661
INADA Naoki3ae20562017-01-16 20:41:20 +090011662Return True if the string is a lowercase string, False otherwise.
11663
11664A string is lowercase if all cased characters in the string are lowercase and
11665there is at least one cased character in the string.
11666[clinic start generated code]*/
11667
11668static PyObject *
11669unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011670/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 Py_ssize_t i, length;
11673 int kind;
11674 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675 int cased;
11676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 if (PyUnicode_READY(self) == -1)
11678 return NULL;
11679 length = PyUnicode_GET_LENGTH(self);
11680 kind = PyUnicode_KIND(self);
11681 data = PyUnicode_DATA(self);
11682
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 if (length == 1)
11685 return PyBool_FromLong(
11686 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011688 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011690 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011691
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 for (i = 0; i < length; i++) {
11694 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011695
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011697 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011698 else if (!cased && Py_UNICODE_ISLOWER(ch))
11699 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011701 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702}
11703
INADA Naoki3ae20562017-01-16 20:41:20 +090011704/*[clinic input]
11705str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706
INADA Naoki3ae20562017-01-16 20:41:20 +090011707Return True if the string is an uppercase string, False otherwise.
11708
11709A string is uppercase if all cased characters in the string are uppercase and
11710there is at least one cased character in the string.
11711[clinic start generated code]*/
11712
11713static PyObject *
11714unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011715/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 Py_ssize_t i, length;
11718 int kind;
11719 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 int cased;
11721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 if (PyUnicode_READY(self) == -1)
11723 return NULL;
11724 length = PyUnicode_GET_LENGTH(self);
11725 kind = PyUnicode_KIND(self);
11726 data = PyUnicode_DATA(self);
11727
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 if (length == 1)
11730 return PyBool_FromLong(
11731 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011733 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011735 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011736
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 for (i = 0; i < length; i++) {
11739 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011740
Benjamin Peterson29060642009-01-31 22:14:21 +000011741 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011742 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 else if (!cased && Py_UNICODE_ISUPPER(ch))
11744 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011746 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747}
11748
INADA Naoki3ae20562017-01-16 20:41:20 +090011749/*[clinic input]
11750str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751
INADA Naoki3ae20562017-01-16 20:41:20 +090011752Return True if the string is a title-cased string, False otherwise.
11753
11754In a title-cased string, upper- and title-case characters may only
11755follow uncased characters and lowercase characters only cased ones.
11756[clinic start generated code]*/
11757
11758static PyObject *
11759unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011760/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 Py_ssize_t i, length;
11763 int kind;
11764 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765 int cased, previous_is_cased;
11766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 if (PyUnicode_READY(self) == -1)
11768 return NULL;
11769 length = PyUnicode_GET_LENGTH(self);
11770 kind = PyUnicode_KIND(self);
11771 data = PyUnicode_DATA(self);
11772
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 if (length == 1) {
11775 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11776 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11777 (Py_UNICODE_ISUPPER(ch) != 0));
11778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011780 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011782 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011783
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 cased = 0;
11785 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 for (i = 0; i < length; i++) {
11787 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011788
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11790 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011791 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 previous_is_cased = 1;
11793 cased = 1;
11794 }
11795 else if (Py_UNICODE_ISLOWER(ch)) {
11796 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011797 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 previous_is_cased = 1;
11799 cased = 1;
11800 }
11801 else
11802 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011804 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805}
11806
INADA Naoki3ae20562017-01-16 20:41:20 +090011807/*[clinic input]
11808str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809
INADA Naoki3ae20562017-01-16 20:41:20 +090011810Return True if the string is a whitespace string, False otherwise.
11811
11812A string is whitespace if all characters in the string are whitespace and there
11813is at least one character in the string.
11814[clinic start generated code]*/
11815
11816static PyObject *
11817unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011818/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 Py_ssize_t i, length;
11821 int kind;
11822 void *data;
11823
11824 if (PyUnicode_READY(self) == -1)
11825 return NULL;
11826 length = PyUnicode_GET_LENGTH(self);
11827 kind = PyUnicode_KIND(self);
11828 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 if (length == 1)
11832 return PyBool_FromLong(
11833 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011835 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011837 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 for (i = 0; i < length; i++) {
11840 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011841 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011842 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011844 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845}
11846
INADA Naoki3ae20562017-01-16 20:41:20 +090011847/*[clinic input]
11848str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011849
INADA Naoki3ae20562017-01-16 20:41:20 +090011850Return True if the string is an alphabetic string, False otherwise.
11851
11852A string is alphabetic if all characters in the string are alphabetic and there
11853is at least one character in the string.
11854[clinic start generated code]*/
11855
11856static PyObject *
11857unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011858/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011859{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 Py_ssize_t i, length;
11861 int kind;
11862 void *data;
11863
11864 if (PyUnicode_READY(self) == -1)
11865 return NULL;
11866 length = PyUnicode_GET_LENGTH(self);
11867 kind = PyUnicode_KIND(self);
11868 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011869
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011870 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if (length == 1)
11872 return PyBool_FromLong(
11873 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011874
11875 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011877 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 for (i = 0; i < length; i++) {
11880 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011881 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011882 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011883 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011884}
11885
INADA Naoki3ae20562017-01-16 20:41:20 +090011886/*[clinic input]
11887str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011888
INADA Naoki3ae20562017-01-16 20:41:20 +090011889Return True if the string is an alpha-numeric string, False otherwise.
11890
11891A string is alpha-numeric if all characters in the string are alpha-numeric and
11892there is at least one character in the string.
11893[clinic start generated code]*/
11894
11895static PyObject *
11896unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011897/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 int kind;
11900 void *data;
11901 Py_ssize_t len, i;
11902
11903 if (PyUnicode_READY(self) == -1)
11904 return NULL;
11905
11906 kind = PyUnicode_KIND(self);
11907 data = PyUnicode_DATA(self);
11908 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011909
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011910 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (len == 1) {
11912 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11913 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11914 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011915
11916 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011918 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 for (i = 0; i < len; i++) {
11921 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011922 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011923 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011924 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011925 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011926}
11927
INADA Naoki3ae20562017-01-16 20:41:20 +090011928/*[clinic input]
11929str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930
INADA Naoki3ae20562017-01-16 20:41:20 +090011931Return True if the string is a decimal string, False otherwise.
11932
11933A string is a decimal string if all characters in the string are decimal and
11934there is at least one character in the string.
11935[clinic start generated code]*/
11936
11937static PyObject *
11938unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011939/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 Py_ssize_t i, length;
11942 int kind;
11943 void *data;
11944
11945 if (PyUnicode_READY(self) == -1)
11946 return NULL;
11947 length = PyUnicode_GET_LENGTH(self);
11948 kind = PyUnicode_KIND(self);
11949 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 if (length == 1)
11953 return PyBool_FromLong(
11954 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011956 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011958 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 for (i = 0; i < length; i++) {
11961 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011962 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011964 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965}
11966
INADA Naoki3ae20562017-01-16 20:41:20 +090011967/*[clinic input]
11968str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
INADA Naoki3ae20562017-01-16 20:41:20 +090011970Return True if the string is a digit string, False otherwise.
11971
11972A string is a digit string if all characters in the string are digits and there
11973is at least one character in the string.
11974[clinic start generated code]*/
11975
11976static PyObject *
11977unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011978/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 Py_ssize_t i, length;
11981 int kind;
11982 void *data;
11983
11984 if (PyUnicode_READY(self) == -1)
11985 return NULL;
11986 length = PyUnicode_GET_LENGTH(self);
11987 kind = PyUnicode_KIND(self);
11988 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 if (length == 1) {
11992 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11993 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011996 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011998 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 for (i = 0; i < length; i++) {
12001 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012002 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012004 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005}
12006
INADA Naoki3ae20562017-01-16 20:41:20 +090012007/*[clinic input]
12008str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
INADA Naoki3ae20562017-01-16 20:41:20 +090012010Return True if the string is a numeric string, False otherwise.
12011
12012A string is numeric if all characters in the string are numeric and there is at
12013least one character in the string.
12014[clinic start generated code]*/
12015
12016static PyObject *
12017unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012018/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 Py_ssize_t i, length;
12021 int kind;
12022 void *data;
12023
12024 if (PyUnicode_READY(self) == -1)
12025 return NULL;
12026 length = PyUnicode_GET_LENGTH(self);
12027 kind = PyUnicode_KIND(self);
12028 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 if (length == 1)
12032 return PyBool_FromLong(
12033 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012035 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012037 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 for (i = 0; i < length; i++) {
12040 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012041 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012043 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044}
12045
Martin v. Löwis47383402007-08-15 07:32:56 +000012046int
12047PyUnicode_IsIdentifier(PyObject *self)
12048{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 int kind;
12050 void *data;
12051 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012052 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 if (PyUnicode_READY(self) == -1) {
12055 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012056 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 }
12058
12059 /* Special case for empty strings */
12060 if (PyUnicode_GET_LENGTH(self) == 0)
12061 return 0;
12062 kind = PyUnicode_KIND(self);
12063 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012064
12065 /* PEP 3131 says that the first character must be in
12066 XID_Start and subsequent characters in XID_Continue,
12067 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012068 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012069 letters, digits, underscore). However, given the current
12070 definition of XID_Start and XID_Continue, it is sufficient
12071 to check just for these, except that _ must be allowed
12072 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012074 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012075 return 0;
12076
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012077 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012080 return 1;
12081}
12082
INADA Naoki3ae20562017-01-16 20:41:20 +090012083/*[clinic input]
12084str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012085
INADA Naoki3ae20562017-01-16 20:41:20 +090012086Return True if the string is a valid Python identifier, False otherwise.
12087
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012088Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012089such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012090[clinic start generated code]*/
12091
12092static PyObject *
12093unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012094/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012095{
12096 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12097}
12098
INADA Naoki3ae20562017-01-16 20:41:20 +090012099/*[clinic input]
12100str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012101
INADA Naoki3ae20562017-01-16 20:41:20 +090012102Return True if the string is printable, False otherwise.
12103
12104A string is printable if all of its characters are considered printable in
12105repr() or if it is empty.
12106[clinic start generated code]*/
12107
12108static PyObject *
12109unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012110/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012111{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 Py_ssize_t i, length;
12113 int kind;
12114 void *data;
12115
12116 if (PyUnicode_READY(self) == -1)
12117 return NULL;
12118 length = PyUnicode_GET_LENGTH(self);
12119 kind = PyUnicode_KIND(self);
12120 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012121
12122 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 if (length == 1)
12124 return PyBool_FromLong(
12125 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 for (i = 0; i < length; i++) {
12128 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012129 Py_RETURN_FALSE;
12130 }
12131 }
12132 Py_RETURN_TRUE;
12133}
12134
INADA Naoki3ae20562017-01-16 20:41:20 +090012135/*[clinic input]
12136str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137
INADA Naoki3ae20562017-01-16 20:41:20 +090012138 iterable: object
12139 /
12140
12141Concatenate any number of strings.
12142
Martin Panter91a88662017-01-24 00:30:06 +000012143The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012144The result is returned as a new string.
12145
12146Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12147[clinic start generated code]*/
12148
12149static PyObject *
12150unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012151/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152{
INADA Naoki3ae20562017-01-16 20:41:20 +090012153 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154}
12155
Martin v. Löwis18e16552006-02-15 17:27:45 +000012156static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012157unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (PyUnicode_READY(self) == -1)
12160 return -1;
12161 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162}
12163
INADA Naoki3ae20562017-01-16 20:41:20 +090012164/*[clinic input]
12165str.ljust as unicode_ljust
12166
12167 width: Py_ssize_t
12168 fillchar: Py_UCS4 = ' '
12169 /
12170
12171Return a left-justified string of length width.
12172
12173Padding is done using the specified fill character (default is a space).
12174[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175
12176static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012177unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12178/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012180 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
Victor Stinnerc4b49542011-12-11 22:44:26 +010012183 if (PyUnicode_GET_LENGTH(self) >= width)
12184 return unicode_result_unchanged(self);
12185
12186 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187}
12188
INADA Naoki3ae20562017-01-16 20:41:20 +090012189/*[clinic input]
12190str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
INADA Naoki3ae20562017-01-16 20:41:20 +090012192Return a copy of the string converted to lowercase.
12193[clinic start generated code]*/
12194
12195static PyObject *
12196unicode_lower_impl(PyObject *self)
12197/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012199 if (PyUnicode_READY(self) == -1)
12200 return NULL;
12201 if (PyUnicode_IS_ASCII(self))
12202 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012203 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204}
12205
/* Selectors for the strip helpers: which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the selectors above */
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip selector to the corresponding method name. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012214
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012215/* externally visible for str.strip(unicode) */
12216PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012217_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 void *data;
12220 int kind;
12221 Py_ssize_t i, j, len;
12222 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012223 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12226 return NULL;
12227
12228 kind = PyUnicode_KIND(self);
12229 data = PyUnicode_DATA(self);
12230 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012231 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12233 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012234 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012235
Benjamin Peterson14339b62009-01-31 16:36:08 +000012236 i = 0;
12237 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012238 while (i < len) {
12239 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12240 if (!BLOOM(sepmask, ch))
12241 break;
12242 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12243 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 i++;
12245 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012246 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012247
Benjamin Peterson14339b62009-01-31 16:36:08 +000012248 j = len;
12249 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012250 j--;
12251 while (j >= i) {
12252 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12253 if (!BLOOM(sepmask, ch))
12254 break;
12255 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12256 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012257 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012258 }
12259
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012261 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012262
Victor Stinner7931d9a2011-11-04 00:22:48 +010012263 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264}
12265
12266PyObject*
12267PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12268{
12269 unsigned char *data;
12270 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012271 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272
Victor Stinnerde636f32011-10-01 03:55:54 +020012273 if (PyUnicode_READY(self) == -1)
12274 return NULL;
12275
Victor Stinner684d5fd2012-05-03 02:32:34 +020012276 length = PyUnicode_GET_LENGTH(self);
12277 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012278
Victor Stinner684d5fd2012-05-03 02:32:34 +020012279 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012280 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281
Victor Stinnerde636f32011-10-01 03:55:54 +020012282 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012283 PyErr_SetString(PyExc_IndexError, "string index out of range");
12284 return NULL;
12285 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012286 if (start >= length || end < start)
12287 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012288
Victor Stinner684d5fd2012-05-03 02:32:34 +020012289 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012290 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012291 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012292 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012293 }
12294 else {
12295 kind = PyUnicode_KIND(self);
12296 data = PyUnicode_1BYTE_DATA(self);
12297 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012298 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012299 length);
12300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
12303static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012304do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 Py_ssize_t len, i, j;
12307
12308 if (PyUnicode_READY(self) == -1)
12309 return NULL;
12310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012312
Victor Stinnercc7af722013-04-09 22:39:24 +020012313 if (PyUnicode_IS_ASCII(self)) {
12314 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12315
12316 i = 0;
12317 if (striptype != RIGHTSTRIP) {
12318 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012319 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012320 if (!_Py_ascii_whitespace[ch])
12321 break;
12322 i++;
12323 }
12324 }
12325
12326 j = len;
12327 if (striptype != LEFTSTRIP) {
12328 j--;
12329 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012330 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012331 if (!_Py_ascii_whitespace[ch])
12332 break;
12333 j--;
12334 }
12335 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012336 }
12337 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012338 else {
12339 int kind = PyUnicode_KIND(self);
12340 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012341
Victor Stinnercc7af722013-04-09 22:39:24 +020012342 i = 0;
12343 if (striptype != RIGHTSTRIP) {
12344 while (i < len) {
12345 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12346 if (!Py_UNICODE_ISSPACE(ch))
12347 break;
12348 i++;
12349 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012350 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012351
12352 j = len;
12353 if (striptype != LEFTSTRIP) {
12354 j--;
12355 while (j >= i) {
12356 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12357 if (!Py_UNICODE_ISSPACE(ch))
12358 break;
12359 j--;
12360 }
12361 j++;
12362 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012363 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012364
Victor Stinner7931d9a2011-11-04 00:22:48 +010012365 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366}
12367
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012368
12369static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012370do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012371{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012372 if (sep != NULL && sep != Py_None) {
12373 if (PyUnicode_Check(sep))
12374 return _PyUnicode_XStrip(self, striptype, sep);
12375 else {
12376 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 "%s arg must be None or str",
12378 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012379 return NULL;
12380 }
12381 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012382
Benjamin Peterson14339b62009-01-31 16:36:08 +000012383 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012384}
12385
12386
INADA Naoki3ae20562017-01-16 20:41:20 +090012387/*[clinic input]
12388str.strip as unicode_strip
12389
12390 chars: object = None
12391 /
12392
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012394
12395If chars is given and not None, remove characters in chars instead.
12396[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012397
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
{
    /* Strip both ends; do_argstrip() chooses between whitespace stripping
       (chars is NULL/None) and stripping the characters in chars. */
    return do_argstrip(self, BOTHSTRIP, chars);
}
12404
12405
INADA Naoki3ae20562017-01-16 20:41:20 +090012406/*[clinic input]
12407str.lstrip as unicode_lstrip
12408
12409 chars: object = NULL
12410 /
12411
12412Return a copy of the string with leading whitespace removed.
12413
12414If chars is given and not None, remove characters in chars instead.
12415[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012416
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
{
    /* Strip the left end only; do_argstrip() chooses between whitespace
       stripping (chars is NULL/None) and stripping the characters in chars. */
    return do_argstrip(self, LEFTSTRIP, chars);
}
12423
12424
INADA Naoki3ae20562017-01-16 20:41:20 +090012425/*[clinic input]
12426str.rstrip as unicode_rstrip
12427
12428 chars: object = NULL
12429 /
12430
12431Return a copy of the string with trailing whitespace removed.
12432
12433If chars is given and not None, remove characters in chars instead.
12434[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012435
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
{
    /* Strip the right end only; do_argstrip() chooses between whitespace
       stripping (chars is NULL/None) and stripping the characters in chars. */
    return do_argstrip(self, RIGHTSTRIP, chars);
}
12442
12443
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012445unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012447 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449
Serhiy Storchaka05997252013-01-26 12:14:02 +020012450 if (len < 1)
12451 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452
Victor Stinnerc4b49542011-12-11 22:44:26 +010012453 /* no repeat, return original string */
12454 if (len == 1)
12455 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012456
Benjamin Petersonbac79492012-01-14 13:34:47 -050012457 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 return NULL;
12459
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012460 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012461 PyErr_SetString(PyExc_OverflowError,
12462 "repeated string is too long");
12463 return NULL;
12464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012466
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012467 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468 if (!u)
12469 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012470 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 if (PyUnicode_GET_LENGTH(str) == 1) {
12473 const int kind = PyUnicode_KIND(str);
12474 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012475 if (kind == PyUnicode_1BYTE_KIND) {
12476 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012477 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012478 }
12479 else if (kind == PyUnicode_2BYTE_KIND) {
12480 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012481 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012482 ucs2[n] = fill_char;
12483 } else {
12484 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12485 assert(kind == PyUnicode_4BYTE_KIND);
12486 for (n = 0; n < len; ++n)
12487 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 }
12490 else {
12491 /* number of characters copied this far */
12492 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012493 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012495 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012499 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012500 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012501 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502 }
12503
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012504 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012505 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506}
12507
Alexander Belopolsky40018472011-02-26 01:02:56 +000012508PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012509PyUnicode_Replace(PyObject *str,
12510 PyObject *substr,
12511 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012512 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012514 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12515 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012516 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012517 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518}
12519
INADA Naoki3ae20562017-01-16 20:41:20 +090012520/*[clinic input]
12521str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
INADA Naoki3ae20562017-01-16 20:41:20 +090012523 old: unicode
12524 new: unicode
12525 count: Py_ssize_t = -1
12526 Maximum number of occurrences to replace.
12527 -1 (the default value) means replace all occurrences.
12528 /
12529
12530Return a copy with all occurrences of substring old replaced by new.
12531
12532If the optional argument count is given, only the first count occurrences are
12533replaced.
12534[clinic start generated code]*/
12535
static PyObject *
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
                     Py_ssize_t count)
/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
{
    /* self must be ready before replace() is called; only self is checked
       here (old and new are declared `unicode` in the clinic input). */
    if (PyUnicode_READY(self) == -1)
        return NULL;
    return replace(self, old, new, count);
}
12545
Alexander Belopolsky40018472011-02-26 01:02:56 +000012546static PyObject *
12547unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012549 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 Py_ssize_t isize;
12551 Py_ssize_t osize, squote, dquote, i, o;
12552 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012553 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012557 return NULL;
12558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 isize = PyUnicode_GET_LENGTH(unicode);
12560 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012562 /* Compute length of output, quote characters, and
12563 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012564 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 max = 127;
12566 squote = dquote = 0;
12567 ikind = PyUnicode_KIND(unicode);
12568 for (i = 0; i < isize; i++) {
12569 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012570 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012572 case '\'': squote++; break;
12573 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012575 incr = 2;
12576 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 default:
12578 /* Fast-path ASCII */
12579 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012580 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012582 ;
12583 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012586 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012588 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012590 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012592 if (osize > PY_SSIZE_T_MAX - incr) {
12593 PyErr_SetString(PyExc_OverflowError,
12594 "string is too long to generate repr");
12595 return NULL;
12596 }
12597 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 }
12599
12600 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012601 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012603 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 if (dquote)
12605 /* Both squote and dquote present. Use squote,
12606 and escape them */
12607 osize += squote;
12608 else
12609 quote = '"';
12610 }
Victor Stinner55c08782013-04-14 18:45:39 +020012611 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612
12613 repr = PyUnicode_New(osize, max);
12614 if (repr == NULL)
12615 return NULL;
12616 okind = PyUnicode_KIND(repr);
12617 odata = PyUnicode_DATA(repr);
12618
12619 PyUnicode_WRITE(okind, odata, 0, quote);
12620 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012621 if (unchanged) {
12622 _PyUnicode_FastCopyCharacters(repr, 1,
12623 unicode, 0,
12624 isize);
12625 }
12626 else {
12627 for (i = 0, o = 1; i < isize; i++) {
12628 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629
Victor Stinner55c08782013-04-14 18:45:39 +020012630 /* Escape quotes and backslashes */
12631 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012632 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012634 continue;
12635 }
12636
12637 /* Map special whitespace to '\t', \n', '\r' */
12638 if (ch == '\t') {
12639 PyUnicode_WRITE(okind, odata, o++, '\\');
12640 PyUnicode_WRITE(okind, odata, o++, 't');
12641 }
12642 else if (ch == '\n') {
12643 PyUnicode_WRITE(okind, odata, o++, '\\');
12644 PyUnicode_WRITE(okind, odata, o++, 'n');
12645 }
12646 else if (ch == '\r') {
12647 PyUnicode_WRITE(okind, odata, o++, '\\');
12648 PyUnicode_WRITE(okind, odata, o++, 'r');
12649 }
12650
12651 /* Map non-printable US ASCII to '\xhh' */
12652 else if (ch < ' ' || ch == 0x7F) {
12653 PyUnicode_WRITE(okind, odata, o++, '\\');
12654 PyUnicode_WRITE(okind, odata, o++, 'x');
12655 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12656 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12657 }
12658
12659 /* Copy ASCII characters as-is */
12660 else if (ch < 0x7F) {
12661 PyUnicode_WRITE(okind, odata, o++, ch);
12662 }
12663
12664 /* Non-ASCII characters */
12665 else {
12666 /* Map Unicode whitespace and control characters
12667 (categories Z* and C* except ASCII space)
12668 */
12669 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12670 PyUnicode_WRITE(okind, odata, o++, '\\');
12671 /* Map 8-bit characters to '\xhh' */
12672 if (ch <= 0xff) {
12673 PyUnicode_WRITE(okind, odata, o++, 'x');
12674 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12675 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12676 }
12677 /* Map 16-bit characters to '\uxxxx' */
12678 else if (ch <= 0xffff) {
12679 PyUnicode_WRITE(okind, odata, o++, 'u');
12680 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12681 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12682 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12683 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12684 }
12685 /* Map 21-bit characters to '\U00xxxxxx' */
12686 else {
12687 PyUnicode_WRITE(okind, odata, o++, 'U');
12688 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12689 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12690 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12691 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12692 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12693 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12694 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12695 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12696 }
12697 }
12698 /* Copy characters as-is */
12699 else {
12700 PyUnicode_WRITE(okind, odata, o++, ch);
12701 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012702 }
12703 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012706 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012707 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708}
12709
/* Docstring text describing S.rfind(); declared via PyDoc_STRVAR. */
PyDoc_STRVAR(rfind__doc__,
             "S.rfind(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718
12719static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012722 /* initialize variables to prevent gcc warning */
12723 PyObject *substring = NULL;
12724 Py_ssize_t start = 0;
12725 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012726 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012728 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012729 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012731 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012734 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 if (result == -2)
12737 return NULL;
12738
Christian Heimes217cfd12007-12-02 14:31:20 +000012739 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740}
12741
/* Docstring text describing S.rindex(); declared via PyDoc_STRVAR. */
PyDoc_STRVAR(rindex__doc__,
             "S.rindex(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750
12751static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012753{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012754 /* initialize variables to prevent gcc warning */
12755 PyObject *substring = NULL;
12756 Py_ssize_t start = 0;
12757 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012758 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012760 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012761 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012763 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012766 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 if (result == -2)
12769 return NULL;
12770
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771 if (result < 0) {
12772 PyErr_SetString(PyExc_ValueError, "substring not found");
12773 return NULL;
12774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775
Christian Heimes217cfd12007-12-02 14:31:20 +000012776 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012777}
12778
INADA Naoki3ae20562017-01-16 20:41:20 +090012779/*[clinic input]
12780str.rjust as unicode_rjust
12781
12782 width: Py_ssize_t
12783 fillchar: Py_UCS4 = ' '
12784 /
12785
12786Return a right-justified string of length width.
12787
12788Padding is done using the specified fill character (default is a space).
12789[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790
12791static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012792unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12793/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012795 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796 return NULL;
12797
Victor Stinnerc4b49542011-12-11 22:44:26 +010012798 if (PyUnicode_GET_LENGTH(self) >= width)
12799 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800
Victor Stinnerc4b49542011-12-11 22:44:26 +010012801 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802}
12803
Alexander Belopolsky40018472011-02-26 01:02:56 +000012804PyObject *
12805PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012807 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012810 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811}
12812
INADA Naoki3ae20562017-01-16 20:41:20 +090012813/*[clinic input]
12814str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815
INADA Naoki3ae20562017-01-16 20:41:20 +090012816 sep: object = None
        The delimiter according to which to split the string.
12818 None (the default value) means split according to any whitespace,
12819 and discard empty strings from the result.
12820 maxsplit: Py_ssize_t = -1
12821 Maximum number of splits to do.
12822 -1 (the default value) means no limit.
12823
12824Return a list of the words in the string, using sep as the delimiter string.
12825[clinic start generated code]*/
12826
12827static PyObject *
12828unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12829/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830{
INADA Naoki3ae20562017-01-16 20:41:20 +090012831 if (sep == Py_None)
12832 return split(self, NULL, maxsplit);
12833 if (PyUnicode_Check(sep))
12834 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012835
Victor Stinner998b8062018-09-12 00:23:25 +020012836 PyErr_Format(PyExc_TypeError,
12837 "must be str or None, not %.100s",
12838 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012839 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840}
12841
Thomas Wouters477c8d52006-05-27 19:21:47 +000012842PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012843PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012844{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012845 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012846 int kind1, kind2;
12847 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012849
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012850 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012852
Victor Stinner14f8f022011-10-05 20:58:25 +020012853 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012855 len1 = PyUnicode_GET_LENGTH(str_obj);
12856 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012857 if (kind1 < kind2 || len1 < len2) {
12858 _Py_INCREF_UNICODE_EMPTY();
12859 if (!unicode_empty)
12860 out = NULL;
12861 else {
12862 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12863 Py_DECREF(unicode_empty);
12864 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012865 return out;
12866 }
12867 buf1 = PyUnicode_DATA(str_obj);
12868 buf2 = PyUnicode_DATA(sep_obj);
12869 if (kind2 != kind1) {
12870 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12871 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012872 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012875 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012877 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12878 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12879 else
12880 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 break;
12882 case PyUnicode_2BYTE_KIND:
12883 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12884 break;
12885 case PyUnicode_4BYTE_KIND:
12886 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12887 break;
12888 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012889 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012891
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012892 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012894
12895 return out;
12896}
12897
12898
12899PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012900PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012902 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012903 int kind1, kind2;
12904 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012907 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012909
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012910 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 len1 = PyUnicode_GET_LENGTH(str_obj);
12913 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012914 if (kind1 < kind2 || len1 < len2) {
12915 _Py_INCREF_UNICODE_EMPTY();
12916 if (!unicode_empty)
12917 out = NULL;
12918 else {
12919 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12920 Py_DECREF(unicode_empty);
12921 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012922 return out;
12923 }
12924 buf1 = PyUnicode_DATA(str_obj);
12925 buf2 = PyUnicode_DATA(sep_obj);
12926 if (kind2 != kind1) {
12927 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12928 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012929 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012930 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012934 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12935 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12936 else
12937 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 break;
12939 case PyUnicode_2BYTE_KIND:
12940 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12941 break;
12942 case PyUnicode_4BYTE_KIND:
12943 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12944 break;
12945 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012946 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012948
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012949 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012951
12952 return out;
12953}
12954
INADA Naoki3ae20562017-01-16 20:41:20 +090012955/*[clinic input]
12956str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012957
INADA Naoki3ae20562017-01-16 20:41:20 +090012958 sep: object
12959 /
12960
12961Partition the string into three parts using the given separator.
12962
12963This will search for the separator in the string. If the separator is found,
12964returns a 3-tuple containing the part before the separator, the separator
12965itself, and the part after it.
12966
12967If the separator is not found, returns a 3-tuple containing the original string
12968and two empty strings.
12969[clinic start generated code]*/
12970
12971static PyObject *
12972unicode_partition(PyObject *self, PyObject *sep)
12973/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012974{
INADA Naoki3ae20562017-01-16 20:41:20 +090012975 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012976}
12977
INADA Naoki3ae20562017-01-16 20:41:20 +090012978/*[clinic input]
12979str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012980
INADA Naoki3ae20562017-01-16 20:41:20 +090012981Partition the string into three parts using the given separator.
12982
Serhiy Storchakaa2314282017-10-29 02:11:54 +030012983This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090012984the separator is found, returns a 3-tuple containing the part before the
12985separator, the separator itself, and the part after it.
12986
12987If the separator is not found, returns a 3-tuple containing two empty strings
12988and the original string.
12989[clinic start generated code]*/
12990
12991static PyObject *
12992unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030012993/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994{
INADA Naoki3ae20562017-01-16 20:41:20 +090012995 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012996}
12997
Alexander Belopolsky40018472011-02-26 01:02:56 +000012998PyObject *
12999PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013000{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013001 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013002 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013003
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013004 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013005}
13006
INADA Naoki3ae20562017-01-16 20:41:20 +090013007/*[clinic input]
13008str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013009
INADA Naoki3ae20562017-01-16 20:41:20 +090013010Return a list of the words in the string, using sep as the delimiter string.
13011
13012Splits are done starting at the end of the string and working to the front.
13013[clinic start generated code]*/
13014
13015static PyObject *
13016unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13017/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013018{
INADA Naoki3ae20562017-01-16 20:41:20 +090013019 if (sep == Py_None)
13020 return rsplit(self, NULL, maxsplit);
13021 if (PyUnicode_Check(sep))
13022 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013023
Victor Stinner998b8062018-09-12 00:23:25 +020013024 PyErr_Format(PyExc_TypeError,
13025 "must be str or None, not %.100s",
13026 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013027 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013028}
13029
INADA Naoki3ae20562017-01-16 20:41:20 +090013030/*[clinic input]
13031str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013033 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013034
13035Return a list of the lines in the string, breaking at line boundaries.
13036
13037Line breaks are not included in the resulting list unless keepends is given and
13038true.
13039[clinic start generated code]*/
13040
13041static PyObject *
13042unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013043/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013045 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046}
13047
13048static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013049PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013051 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052}
13053
INADA Naoki3ae20562017-01-16 20:41:20 +090013054/*[clinic input]
13055str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056
INADA Naoki3ae20562017-01-16 20:41:20 +090013057Convert uppercase characters to lowercase and lowercase characters to uppercase.
13058[clinic start generated code]*/
13059
13060static PyObject *
13061unicode_swapcase_impl(PyObject *self)
13062/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013064 if (PyUnicode_READY(self) == -1)
13065 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013066 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067}
13068
Larry Hastings61272b72014-01-07 12:41:53 -080013069/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013070
Larry Hastings31826802013-10-19 00:09:25 -070013071@staticmethod
13072str.maketrans as unicode_maketrans
13073
13074 x: object
13075
13076 y: unicode=NULL
13077
13078 z: unicode=NULL
13079
13080 /
13081
13082Return a translation table usable for str.translate().
13083
13084If there is only one argument, it must be a dictionary mapping Unicode
13085ordinals (integers) or characters to Unicode ordinals, strings or None.
13086Character keys will be then converted to ordinals.
13087If there are two arguments, they must be strings of equal length, and
13088in the resulting dictionary, each character in x will be mapped to the
13089character at the same position in y. If there is a third argument, it
13090must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013091[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013092
Larry Hastings31826802013-10-19 00:09:25 -070013093static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013094unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013095/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013096{
Georg Brandlceee0772007-11-27 23:48:05 +000013097 PyObject *new = NULL, *key, *value;
13098 Py_ssize_t i = 0;
13099 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013100
Georg Brandlceee0772007-11-27 23:48:05 +000013101 new = PyDict_New();
13102 if (!new)
13103 return NULL;
13104 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 int x_kind, y_kind, z_kind;
13106 void *x_data, *y_data, *z_data;
13107
Georg Brandlceee0772007-11-27 23:48:05 +000013108 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013109 if (!PyUnicode_Check(x)) {
13110 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13111 "be a string if there is a second argument");
13112 goto err;
13113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013115 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13116 "arguments must have equal length");
13117 goto err;
13118 }
13119 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 x_kind = PyUnicode_KIND(x);
13121 y_kind = PyUnicode_KIND(y);
13122 x_data = PyUnicode_DATA(x);
13123 y_data = PyUnicode_DATA(y);
13124 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13125 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013126 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013127 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013128 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013129 if (!value) {
13130 Py_DECREF(key);
13131 goto err;
13132 }
Georg Brandlceee0772007-11-27 23:48:05 +000013133 res = PyDict_SetItem(new, key, value);
13134 Py_DECREF(key);
13135 Py_DECREF(value);
13136 if (res < 0)
13137 goto err;
13138 }
13139 /* create entries for deleting chars in z */
13140 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 z_kind = PyUnicode_KIND(z);
13142 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013143 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013145 if (!key)
13146 goto err;
13147 res = PyDict_SetItem(new, key, Py_None);
13148 Py_DECREF(key);
13149 if (res < 0)
13150 goto err;
13151 }
13152 }
13153 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154 int kind;
13155 void *data;
13156
Georg Brandlceee0772007-11-27 23:48:05 +000013157 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013158 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013159 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13160 "to maketrans it must be a dict");
13161 goto err;
13162 }
13163 /* copy entries into the new dict, converting string keys to int keys */
13164 while (PyDict_Next(x, &i, &key, &value)) {
13165 if (PyUnicode_Check(key)) {
13166 /* convert string keys to integer keys */
13167 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013168 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013169 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13170 "table must be of length 1");
13171 goto err;
13172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 kind = PyUnicode_KIND(key);
13174 data = PyUnicode_DATA(key);
13175 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013176 if (!newkey)
13177 goto err;
13178 res = PyDict_SetItem(new, newkey, value);
13179 Py_DECREF(newkey);
13180 if (res < 0)
13181 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013182 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013183 /* just keep integer keys */
13184 if (PyDict_SetItem(new, key, value) < 0)
13185 goto err;
13186 } else {
13187 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13188 "be strings or integers");
13189 goto err;
13190 }
13191 }
13192 }
13193 return new;
13194 err:
13195 Py_DECREF(new);
13196 return NULL;
13197}
13198
INADA Naoki3ae20562017-01-16 20:41:20 +090013199/*[clinic input]
13200str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201
INADA Naoki3ae20562017-01-16 20:41:20 +090013202 table: object
13203 Translation table, which must be a mapping of Unicode ordinals to
13204 Unicode ordinals, strings, or None.
13205 /
13206
13207Replace each character in the string using the given translation table.
13208
13209The table must implement lookup/indexing via __getitem__, for instance a
13210dictionary or list. If this operation raises LookupError, the character is
13211left untouched. Characters mapped to None are deleted.
13212[clinic start generated code]*/
13213
13214static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013216/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013218 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219}
13220
INADA Naoki3ae20562017-01-16 20:41:20 +090013221/*[clinic input]
13222str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223
INADA Naoki3ae20562017-01-16 20:41:20 +090013224Return a copy of the string converted to uppercase.
13225[clinic start generated code]*/
13226
13227static PyObject *
13228unicode_upper_impl(PyObject *self)
13229/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013231 if (PyUnicode_READY(self) == -1)
13232 return NULL;
13233 if (PyUnicode_IS_ASCII(self))
13234 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013235 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236}
13237
INADA Naoki3ae20562017-01-16 20:41:20 +090013238/*[clinic input]
13239str.zfill as unicode_zfill
13240
13241 width: Py_ssize_t
13242 /
13243
13244Pad a numeric string with zeros on the left, to fill a field of the given width.
13245
13246The string is never truncated.
13247[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248
13249static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013250unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013251/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013253 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013254 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 int kind;
13256 void *data;
13257 Py_UCS4 chr;
13258
Benjamin Petersonbac79492012-01-14 13:34:47 -050013259 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261
Victor Stinnerc4b49542011-12-11 22:44:26 +010013262 if (PyUnicode_GET_LENGTH(self) >= width)
13263 return unicode_result_unchanged(self);
13264
13265 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266
13267 u = pad(self, fill, 0, '0');
13268
Walter Dörwald068325e2002-04-15 13:36:47 +000013269 if (u == NULL)
13270 return NULL;
13271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272 kind = PyUnicode_KIND(u);
13273 data = PyUnicode_DATA(u);
13274 chr = PyUnicode_READ(kind, data, fill);
13275
13276 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 PyUnicode_WRITE(kind, data, 0, chr);
13279 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280 }
13281
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013282 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013283 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285
#if 0
/* Compiled out by the "#if 0" guard.  When enabled this is a plain wrapper
   that returns the result of PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013294PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013297Return True if S starts with the specified prefix, False otherwise.\n\
13298With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013299With optional end, stop comparing S at that position.\n\
13300prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301
13302static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013303unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013306 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013307 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013308 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013309 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013310 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311
Jesus Ceaac451502011-04-20 17:09:23 +020013312 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013314 if (PyTuple_Check(subobj)) {
13315 Py_ssize_t i;
13316 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013317 substring = PyTuple_GET_ITEM(subobj, i);
13318 if (!PyUnicode_Check(substring)) {
13319 PyErr_Format(PyExc_TypeError,
13320 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013321 "not %.100s",
13322 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013323 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013324 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013325 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013326 if (result == -1)
13327 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013328 if (result) {
13329 Py_RETURN_TRUE;
13330 }
13331 }
13332 /* nothing matched */
13333 Py_RETURN_FALSE;
13334 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013335 if (!PyUnicode_Check(subobj)) {
13336 PyErr_Format(PyExc_TypeError,
13337 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013338 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013339 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013340 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013341 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013342 if (result == -1)
13343 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013344 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345}
13346
13347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013348PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013349 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013351Return True if S ends with the specified suffix, False otherwise.\n\
13352With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013353With optional end, stop comparing S at that position.\n\
13354suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355
13356static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013357unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013358 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013359{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013360 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013361 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013362 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013363 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013364 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365
Jesus Ceaac451502011-04-20 17:09:23 +020013366 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013368 if (PyTuple_Check(subobj)) {
13369 Py_ssize_t i;
13370 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013371 substring = PyTuple_GET_ITEM(subobj, i);
13372 if (!PyUnicode_Check(substring)) {
13373 PyErr_Format(PyExc_TypeError,
13374 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013375 "not %.100s",
13376 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013378 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013379 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013380 if (result == -1)
13381 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 if (result) {
13383 Py_RETURN_TRUE;
13384 }
13385 }
13386 Py_RETURN_FALSE;
13387 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013388 if (!PyUnicode_Check(subobj)) {
13389 PyErr_Format(PyExc_TypeError,
13390 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013391 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013393 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013394 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013395 if (result == -1)
13396 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013397 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398}
13399
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013400static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013401_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013402{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013403 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13404 writer->data = PyUnicode_DATA(writer->buffer);
13405
13406 if (!writer->readonly) {
13407 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013408 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013409 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013410 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013411 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13412 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13413 writer->kind = PyUnicode_WCHAR_KIND;
13414 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13415
Victor Stinner8f674cc2013-04-17 23:02:17 +020013416 /* Copy-on-write mode: set buffer size to 0 so
13417 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13418 * next write. */
13419 writer->size = 0;
13420 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013421}
13422
Victor Stinnerd3f08822012-05-29 12:57:52 +020013423void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013424_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013425{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013426 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013427
13428 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013429 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013430
13431 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13432 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13433 writer->kind = PyUnicode_WCHAR_KIND;
13434 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013435}
13436
Victor Stinnerd3f08822012-05-29 12:57:52 +020013437int
13438_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13439 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013440{
13441 Py_ssize_t newlen;
13442 PyObject *newbuffer;
13443
Victor Stinner2740e462016-09-06 16:58:36 -070013444 assert(maxchar <= MAX_UNICODE);
13445
Victor Stinnerca9381e2015-09-22 00:58:32 +020013446 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013447 assert((maxchar > writer->maxchar && length >= 0)
13448 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013449
Victor Stinner202fdca2012-05-07 12:47:02 +020013450 if (length > PY_SSIZE_T_MAX - writer->pos) {
13451 PyErr_NoMemory();
13452 return -1;
13453 }
13454 newlen = writer->pos + length;
13455
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013456 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013457
Victor Stinnerd3f08822012-05-29 12:57:52 +020013458 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013459 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013460 if (writer->overallocate
13461 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13462 /* overallocate to limit the number of realloc() */
13463 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013464 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013465 if (newlen < writer->min_length)
13466 newlen = writer->min_length;
13467
Victor Stinnerd3f08822012-05-29 12:57:52 +020013468 writer->buffer = PyUnicode_New(newlen, maxchar);
13469 if (writer->buffer == NULL)
13470 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013471 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013472 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013473 if (writer->overallocate
13474 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13475 /* overallocate to limit the number of realloc() */
13476 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013477 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013478 if (newlen < writer->min_length)
13479 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013480
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013481 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013482 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013483 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013484 newbuffer = PyUnicode_New(newlen, maxchar);
13485 if (newbuffer == NULL)
13486 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013487 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13488 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013489 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013490 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013491 }
13492 else {
13493 newbuffer = resize_compact(writer->buffer, newlen);
13494 if (newbuffer == NULL)
13495 return -1;
13496 }
13497 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013498 }
13499 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013500 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013501 newbuffer = PyUnicode_New(writer->size, maxchar);
13502 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013503 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013504 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13505 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013506 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013507 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013508 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013509 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013510
13511#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013512}
13513
Victor Stinnerca9381e2015-09-22 00:58:32 +020013514int
13515_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13516 enum PyUnicode_Kind kind)
13517{
13518 Py_UCS4 maxchar;
13519
13520 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13521 assert(writer->kind < kind);
13522
13523 switch (kind)
13524 {
13525 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13526 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13527 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13528 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013529 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013530 }
13531
13532 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13533}
13534
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013535static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013536_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013537{
Victor Stinner2740e462016-09-06 16:58:36 -070013538 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013539 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13540 return -1;
13541 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13542 writer->pos++;
13543 return 0;
13544}
13545
13546int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013547_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13548{
13549 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13550}
13551
13552int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013553_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13554{
13555 Py_UCS4 maxchar;
13556 Py_ssize_t len;
13557
13558 if (PyUnicode_READY(str) == -1)
13559 return -1;
13560 len = PyUnicode_GET_LENGTH(str);
13561 if (len == 0)
13562 return 0;
13563 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13564 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013565 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013566 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013567 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013568 Py_INCREF(str);
13569 writer->buffer = str;
13570 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013571 writer->pos += len;
13572 return 0;
13573 }
13574 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13575 return -1;
13576 }
13577 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13578 str, 0, len);
13579 writer->pos += len;
13580 return 0;
13581}
13582
Victor Stinnere215d962012-10-06 23:03:36 +020013583int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013584_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13585 Py_ssize_t start, Py_ssize_t end)
13586{
13587 Py_UCS4 maxchar;
13588 Py_ssize_t len;
13589
13590 if (PyUnicode_READY(str) == -1)
13591 return -1;
13592
13593 assert(0 <= start);
13594 assert(end <= PyUnicode_GET_LENGTH(str));
13595 assert(start <= end);
13596
13597 if (end == 0)
13598 return 0;
13599
13600 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13601 return _PyUnicodeWriter_WriteStr(writer, str);
13602
13603 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13604 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13605 else
13606 maxchar = writer->maxchar;
13607 len = end - start;
13608
13609 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13610 return -1;
13611
13612 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13613 str, start, len);
13614 writer->pos += len;
13615 return 0;
13616}
13617
13618int
Victor Stinner4a587072013-11-19 12:54:53 +010013619_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13620 const char *ascii, Py_ssize_t len)
13621{
13622 if (len == -1)
13623 len = strlen(ascii);
13624
13625 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13626
13627 if (writer->buffer == NULL && !writer->overallocate) {
13628 PyObject *str;
13629
13630 str = _PyUnicode_FromASCII(ascii, len);
13631 if (str == NULL)
13632 return -1;
13633
13634 writer->readonly = 1;
13635 writer->buffer = str;
13636 _PyUnicodeWriter_Update(writer);
13637 writer->pos += len;
13638 return 0;
13639 }
13640
13641 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13642 return -1;
13643
13644 switch (writer->kind)
13645 {
13646 case PyUnicode_1BYTE_KIND:
13647 {
13648 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13649 Py_UCS1 *data = writer->data;
13650
Christian Heimesf051e432016-09-13 20:22:02 +020013651 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013652 break;
13653 }
13654 case PyUnicode_2BYTE_KIND:
13655 {
13656 _PyUnicode_CONVERT_BYTES(
13657 Py_UCS1, Py_UCS2,
13658 ascii, ascii + len,
13659 (Py_UCS2 *)writer->data + writer->pos);
13660 break;
13661 }
13662 case PyUnicode_4BYTE_KIND:
13663 {
13664 _PyUnicode_CONVERT_BYTES(
13665 Py_UCS1, Py_UCS4,
13666 ascii, ascii + len,
13667 (Py_UCS4 *)writer->data + writer->pos);
13668 break;
13669 }
13670 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013671 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013672 }
13673
13674 writer->pos += len;
13675 return 0;
13676}
13677
13678int
13679_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13680 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013681{
13682 Py_UCS4 maxchar;
13683
13684 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13685 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13686 return -1;
13687 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13688 writer->pos += len;
13689 return 0;
13690}
13691
Victor Stinnerd3f08822012-05-29 12:57:52 +020013692PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013693_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013694{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013695 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013696
Victor Stinnerd3f08822012-05-29 12:57:52 +020013697 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013698 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013699 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013700 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013701
13702 str = writer->buffer;
13703 writer->buffer = NULL;
13704
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013705 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013706 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13707 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013708 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013709
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013710 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13711 PyObject *str2;
13712 str2 = resize_compact(str, writer->pos);
13713 if (str2 == NULL) {
13714 Py_DECREF(str);
13715 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013716 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013717 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013718 }
13719
Victor Stinner15a0bd32013-07-08 22:29:55 +020013720 assert(_PyUnicode_CheckConsistency(str, 1));
13721 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013722}
13723
Victor Stinnerd3f08822012-05-29 12:57:52 +020013724void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013725_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013726{
13727 Py_CLEAR(writer->buffer);
13728}
13729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013730#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013731
13732PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013734\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013735Return a formatted version of S, using substitutions from args and kwargs.\n\
13736The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013737
Eric Smith27bbca62010-11-04 17:06:58 +000013738PyDoc_STRVAR(format_map__doc__,
13739 "S.format_map(mapping) -> str\n\
13740\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013741Return a formatted version of S, using substitutions from mapping.\n\
13742The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013743
INADA Naoki3ae20562017-01-16 20:41:20 +090013744/*[clinic input]
13745str.__format__ as unicode___format__
13746
13747 format_spec: unicode
13748 /
13749
13750Return a formatted version of the string as described by format_spec.
13751[clinic start generated code]*/
13752
Eric Smith4a7d76d2008-05-30 18:10:19 +000013753static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013754unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013755/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013756{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013757 _PyUnicodeWriter writer;
13758 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013759
Victor Stinnerd3f08822012-05-29 12:57:52 +020013760 if (PyUnicode_READY(self) == -1)
13761 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013762 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013763 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13764 self, format_spec, 0,
13765 PyUnicode_GET_LENGTH(format_spec));
13766 if (ret == -1) {
13767 _PyUnicodeWriter_Dealloc(&writer);
13768 return NULL;
13769 }
13770 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013771}
13772
INADA Naoki3ae20562017-01-16 20:41:20 +090013773/*[clinic input]
13774str.__sizeof__ as unicode_sizeof
13775
13776Return the size of the string in memory, in bytes.
13777[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013778
13779static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013780unicode_sizeof_impl(PyObject *self)
13781/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013783 Py_ssize_t size;
13784
13785 /* If it's a compact object, account for base structure +
13786 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013787 if (PyUnicode_IS_COMPACT_ASCII(self))
13788 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13789 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013791 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013792 else {
13793 /* If it is a two-block object, account for base object, and
13794 for character block if present. */
13795 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013796 if (_PyUnicode_DATA_ANY(self))
13797 size += (PyUnicode_GET_LENGTH(self) + 1) *
13798 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799 }
13800 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013801 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013802 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13803 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13804 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13805 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013806
13807 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013808}
13809
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013810static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013811unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013812{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013813 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013814 if (!copy)
13815 return NULL;
13816 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013817}
13818
Guido van Rossumd57fd912000-03-10 22:53:23 +000013819static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013820 UNICODE_ENCODE_METHODDEF
13821 UNICODE_REPLACE_METHODDEF
13822 UNICODE_SPLIT_METHODDEF
13823 UNICODE_RSPLIT_METHODDEF
13824 UNICODE_JOIN_METHODDEF
13825 UNICODE_CAPITALIZE_METHODDEF
13826 UNICODE_CASEFOLD_METHODDEF
13827 UNICODE_TITLE_METHODDEF
13828 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013829 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013830 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013831 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013832 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013833 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013834 UNICODE_LJUST_METHODDEF
13835 UNICODE_LOWER_METHODDEF
13836 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013837 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13838 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013839 UNICODE_RJUST_METHODDEF
13840 UNICODE_RSTRIP_METHODDEF
13841 UNICODE_RPARTITION_METHODDEF
13842 UNICODE_SPLITLINES_METHODDEF
13843 UNICODE_STRIP_METHODDEF
13844 UNICODE_SWAPCASE_METHODDEF
13845 UNICODE_TRANSLATE_METHODDEF
13846 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013847 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13848 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013849 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013850 UNICODE_ISLOWER_METHODDEF
13851 UNICODE_ISUPPER_METHODDEF
13852 UNICODE_ISTITLE_METHODDEF
13853 UNICODE_ISSPACE_METHODDEF
13854 UNICODE_ISDECIMAL_METHODDEF
13855 UNICODE_ISDIGIT_METHODDEF
13856 UNICODE_ISNUMERIC_METHODDEF
13857 UNICODE_ISALPHA_METHODDEF
13858 UNICODE_ISALNUM_METHODDEF
13859 UNICODE_ISIDENTIFIER_METHODDEF
13860 UNICODE_ISPRINTABLE_METHODDEF
13861 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013862 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013863 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013864 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013865 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013866 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013867#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013868 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013869 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013870#endif
13871
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013872 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013873 {NULL, NULL}
13874};
13875
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013876static PyObject *
13877unicode_mod(PyObject *v, PyObject *w)
13878{
Brian Curtindfc80e32011-08-10 20:28:54 -050013879 if (!PyUnicode_Check(v))
13880 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013881 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013882}
13883
13884static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013885 0, /*nb_add*/
13886 0, /*nb_subtract*/
13887 0, /*nb_multiply*/
13888 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013889};
13890
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013892 (lenfunc) unicode_length, /* sq_length */
13893 PyUnicode_Concat, /* sq_concat */
13894 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13895 (ssizeargfunc) unicode_getitem, /* sq_item */
13896 0, /* sq_slice */
13897 0, /* sq_ass_item */
13898 0, /* sq_ass_slice */
13899 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013900};
13901
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013902static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013903unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013905 if (PyUnicode_READY(self) == -1)
13906 return NULL;
13907
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013908 if (PyIndex_Check(item)) {
13909 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013910 if (i == -1 && PyErr_Occurred())
13911 return NULL;
13912 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013913 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013914 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013915 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013916 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013917 PyObject *result;
13918 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013919 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013920 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013921
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013922 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013923 return NULL;
13924 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013925 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13926 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013927
13928 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013929 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013930 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013931 slicelength == PyUnicode_GET_LENGTH(self)) {
13932 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013933 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013934 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013935 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013936 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013937 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013938 src_kind = PyUnicode_KIND(self);
13939 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013940 if (!PyUnicode_IS_ASCII(self)) {
13941 kind_limit = kind_maxchar_limit(src_kind);
13942 max_char = 0;
13943 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13944 ch = PyUnicode_READ(src_kind, src_data, cur);
13945 if (ch > max_char) {
13946 max_char = ch;
13947 if (max_char >= kind_limit)
13948 break;
13949 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013950 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013951 }
Victor Stinner55c99112011-10-13 01:17:06 +020013952 else
13953 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013954 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013955 if (result == NULL)
13956 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013957 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013958 dest_data = PyUnicode_DATA(result);
13959
13960 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013961 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13962 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013963 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013964 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013965 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013966 } else {
13967 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13968 return NULL;
13969 }
13970}
13971
13972static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013973 (lenfunc)unicode_length, /* mp_length */
13974 (binaryfunc)unicode_subscript, /* mp_subscript */
13975 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013976};
13977
Guido van Rossumd57fd912000-03-10 22:53:23 +000013978
Guido van Rossumd57fd912000-03-10 22:53:23 +000013979/* Helpers for PyUnicode_Format() */
13980
Victor Stinnera47082312012-10-04 02:19:54 +020013981struct unicode_formatter_t {
13982 PyObject *args;
13983 int args_owned;
13984 Py_ssize_t arglen, argidx;
13985 PyObject *dict;
13986
13987 enum PyUnicode_Kind fmtkind;
13988 Py_ssize_t fmtcnt, fmtpos;
13989 void *fmtdata;
13990 PyObject *fmtstr;
13991
13992 _PyUnicodeWriter writer;
13993};
13994
13995struct unicode_format_arg_t {
13996 Py_UCS4 ch;
13997 int flags;
13998 Py_ssize_t width;
13999 int prec;
14000 int sign;
14001};
14002
Guido van Rossumd57fd912000-03-10 22:53:23 +000014003static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014004unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014005{
Victor Stinnera47082312012-10-04 02:19:54 +020014006 Py_ssize_t argidx = ctx->argidx;
14007
14008 if (argidx < ctx->arglen) {
14009 ctx->argidx++;
14010 if (ctx->arglen < 0)
14011 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014012 else
Victor Stinnera47082312012-10-04 02:19:54 +020014013 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014014 }
14015 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014016 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014017 return NULL;
14018}
14019
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014020/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014021
Victor Stinnera47082312012-10-04 02:19:54 +020014022/* Format a float into the writer if the writer is not NULL, or into *p_output
14023 otherwise.
14024
14025 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014026static int
Victor Stinnera47082312012-10-04 02:19:54 +020014027formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14028 PyObject **p_output,
14029 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014030{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014031 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014032 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014033 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014034 int prec;
14035 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014036
Guido van Rossumd57fd912000-03-10 22:53:23 +000014037 x = PyFloat_AsDouble(v);
14038 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014039 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014040
Victor Stinnera47082312012-10-04 02:19:54 +020014041 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014042 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014043 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014044
Victor Stinnera47082312012-10-04 02:19:54 +020014045 if (arg->flags & F_ALT)
14046 dtoa_flags = Py_DTSF_ALT;
14047 else
14048 dtoa_flags = 0;
14049 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014050 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014051 return -1;
14052 len = strlen(p);
14053 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014054 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014055 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014056 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014057 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014058 }
14059 else
14060 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014061 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014062 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014063}
14064
Victor Stinnerd0880d52012-04-27 23:40:13 +020014065/* formatlong() emulates the format codes d, u, o, x and X, and
14066 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14067 * Python's regular ints.
14068 * Return value: a new PyUnicodeObject*, or NULL if error.
14069 * The output string is of the form
14070 * "-"? ("0x" | "0X")? digit+
14071 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14072 * set in flags. The case of hex digits will be correct,
14073 * There will be at least prec digits, zero-filled on the left if
14074 * necessary to get that many.
14075 * val object to be converted
14076 * flags bitmask of format flags; only F_ALT is looked at
14077 * prec minimum number of digits; 0-fill on left if needed
14078 * type a character in [duoxX]; u acts the same as d
14079 *
14080 * CAUTION: o, x and X conversions on regular ints can never
14081 * produce a '-' sign, but can for Python's unbounded ints.
14082 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014083PyObject *
14084_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014085{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014086 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014087 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014088 Py_ssize_t i;
14089 int sign; /* 1 if '-', else 0 */
14090 int len; /* number of characters */
14091 Py_ssize_t llen;
14092 int numdigits; /* len == numnondigits + numdigits */
14093 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014094
Victor Stinnerd0880d52012-04-27 23:40:13 +020014095 /* Avoid exceeding SSIZE_T_MAX */
14096 if (prec > INT_MAX-3) {
14097 PyErr_SetString(PyExc_OverflowError,
14098 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014099 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014100 }
14101
14102 assert(PyLong_Check(val));
14103
14104 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014105 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014106 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014107 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014108 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014109 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014110 /* int and int subclasses should print numerically when a numeric */
14111 /* format code is used (see issue18780) */
14112 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014113 break;
14114 case 'o':
14115 numnondigits = 2;
14116 result = PyNumber_ToBase(val, 8);
14117 break;
14118 case 'x':
14119 case 'X':
14120 numnondigits = 2;
14121 result = PyNumber_ToBase(val, 16);
14122 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014123 }
14124 if (!result)
14125 return NULL;
14126
14127 assert(unicode_modifiable(result));
14128 assert(PyUnicode_IS_READY(result));
14129 assert(PyUnicode_IS_ASCII(result));
14130
14131 /* To modify the string in-place, there can only be one reference. */
14132 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014133 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014134 PyErr_BadInternalCall();
14135 return NULL;
14136 }
14137 buf = PyUnicode_DATA(result);
14138 llen = PyUnicode_GET_LENGTH(result);
14139 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014140 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014141 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014142 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014143 return NULL;
14144 }
14145 len = (int)llen;
14146 sign = buf[0] == '-';
14147 numnondigits += sign;
14148 numdigits = len - numnondigits;
14149 assert(numdigits > 0);
14150
14151 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014152 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014153 (type == 'o' || type == 'x' || type == 'X'))) {
14154 assert(buf[sign] == '0');
14155 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14156 buf[sign+1] == 'o');
14157 numnondigits -= 2;
14158 buf += 2;
14159 len -= 2;
14160 if (sign)
14161 buf[0] = '-';
14162 assert(len == numnondigits + numdigits);
14163 assert(numdigits > 0);
14164 }
14165
14166 /* Fill with leading zeroes to meet minimum width. */
14167 if (prec > numdigits) {
14168 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14169 numnondigits + prec);
14170 char *b1;
14171 if (!r1) {
14172 Py_DECREF(result);
14173 return NULL;
14174 }
14175 b1 = PyBytes_AS_STRING(r1);
14176 for (i = 0; i < numnondigits; ++i)
14177 *b1++ = *buf++;
14178 for (i = 0; i < prec - numdigits; i++)
14179 *b1++ = '0';
14180 for (i = 0; i < numdigits; i++)
14181 *b1++ = *buf++;
14182 *b1 = '\0';
14183 Py_DECREF(result);
14184 result = r1;
14185 buf = PyBytes_AS_STRING(result);
14186 len = numnondigits + prec;
14187 }
14188
14189 /* Fix up case for hex conversions. */
14190 if (type == 'X') {
14191 /* Need to convert all lower case letters to upper case.
14192 and need to convert 0x to 0X (and -0x to -0X). */
14193 for (i = 0; i < len; i++)
14194 if (buf[i] >= 'a' && buf[i] <= 'x')
14195 buf[i] -= 'a'-'A';
14196 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014197 if (!PyUnicode_Check(result)
14198 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014199 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014200 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014201 Py_DECREF(result);
14202 result = unicode;
14203 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014204 else if (len != PyUnicode_GET_LENGTH(result)) {
14205 if (PyUnicode_Resize(&result, len) < 0)
14206 Py_CLEAR(result);
14207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014208 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014209}
14210
Ethan Furmandf3ed242014-01-05 06:50:30 -080014211/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014212 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014213 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014214 * -1 and raise an exception on error */
14215static int
Victor Stinnera47082312012-10-04 02:19:54 +020014216mainformatlong(PyObject *v,
14217 struct unicode_format_arg_t *arg,
14218 PyObject **p_output,
14219 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014220{
14221 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014222 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014223
14224 if (!PyNumber_Check(v))
14225 goto wrongtype;
14226
Ethan Furman9ab74802014-03-21 06:38:46 -070014227 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014228 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014229 if (type == 'o' || type == 'x' || type == 'X') {
14230 iobj = PyNumber_Index(v);
14231 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014232 if (PyErr_ExceptionMatches(PyExc_TypeError))
14233 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014234 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014235 }
14236 }
14237 else {
14238 iobj = PyNumber_Long(v);
14239 if (iobj == NULL ) {
14240 if (PyErr_ExceptionMatches(PyExc_TypeError))
14241 goto wrongtype;
14242 return -1;
14243 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014244 }
14245 assert(PyLong_Check(iobj));
14246 }
14247 else {
14248 iobj = v;
14249 Py_INCREF(iobj);
14250 }
14251
14252 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014253 && arg->width == -1 && arg->prec == -1
14254 && !(arg->flags & (F_SIGN | F_BLANK))
14255 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014256 {
14257 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014258 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014259 int base;
14260
Victor Stinnera47082312012-10-04 02:19:54 +020014261 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014262 {
14263 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014264 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014265 case 'd':
14266 case 'i':
14267 case 'u':
14268 base = 10;
14269 break;
14270 case 'o':
14271 base = 8;
14272 break;
14273 case 'x':
14274 case 'X':
14275 base = 16;
14276 break;
14277 }
14278
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014279 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14280 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014281 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014282 }
14283 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014284 return 1;
14285 }
14286
Ethan Furmanb95b5612015-01-23 20:05:18 -080014287 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014288 Py_DECREF(iobj);
14289 if (res == NULL)
14290 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014291 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014292 return 0;
14293
14294wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014295 switch(type)
14296 {
14297 case 'o':
14298 case 'x':
14299 case 'X':
14300 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014301 "%%%c format: an integer is required, "
14302 "not %.200s",
14303 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014304 break;
14305 default:
14306 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014307 "%%%c format: a number is required, "
14308 "not %.200s",
14309 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014310 break;
14311 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014312 return -1;
14313}
14314
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014315static Py_UCS4
14316formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014317{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014318 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014319 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014320 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014321 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014322 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014323 goto onError;
14324 }
14325 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014326 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014327 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014328 /* make sure number is a type of integer */
14329 if (!PyLong_Check(v)) {
14330 iobj = PyNumber_Index(v);
14331 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014332 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014333 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014334 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014335 Py_DECREF(iobj);
14336 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014337 else {
14338 x = PyLong_AsLong(v);
14339 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014340 if (x == -1 && PyErr_Occurred())
14341 goto onError;
14342
Victor Stinner8faf8212011-12-08 22:14:11 +010014343 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014344 PyErr_SetString(PyExc_OverflowError,
14345 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014346 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014347 }
14348
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014349 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014350 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014351
Benjamin Peterson29060642009-01-31 22:14:21 +000014352 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014353 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014354 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014355 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014356}
14357
Victor Stinnera47082312012-10-04 02:19:54 +020014358/* Parse options of an argument: flags, width, precision.
14359 Handle also "%(name)" syntax.
14360
14361 Return 0 if the argument has been formatted into arg->str.
14362 Return 1 if the argument has been written into ctx->writer,
14363 Raise an exception and return -1 on error. */
14364static int
14365unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14366 struct unicode_format_arg_t *arg)
14367{
14368#define FORMAT_READ(ctx) \
14369 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14370
14371 PyObject *v;
14372
Victor Stinnera47082312012-10-04 02:19:54 +020014373 if (arg->ch == '(') {
14374 /* Get argument value from a dictionary. Example: "%(name)s". */
14375 Py_ssize_t keystart;
14376 Py_ssize_t keylen;
14377 PyObject *key;
14378 int pcount = 1;
14379
14380 if (ctx->dict == NULL) {
14381 PyErr_SetString(PyExc_TypeError,
14382 "format requires a mapping");
14383 return -1;
14384 }
14385 ++ctx->fmtpos;
14386 --ctx->fmtcnt;
14387 keystart = ctx->fmtpos;
14388 /* Skip over balanced parentheses */
14389 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14390 arg->ch = FORMAT_READ(ctx);
14391 if (arg->ch == ')')
14392 --pcount;
14393 else if (arg->ch == '(')
14394 ++pcount;
14395 ctx->fmtpos++;
14396 }
14397 keylen = ctx->fmtpos - keystart - 1;
14398 if (ctx->fmtcnt < 0 || pcount > 0) {
14399 PyErr_SetString(PyExc_ValueError,
14400 "incomplete format key");
14401 return -1;
14402 }
14403 key = PyUnicode_Substring(ctx->fmtstr,
14404 keystart, keystart + keylen);
14405 if (key == NULL)
14406 return -1;
14407 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014408 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014409 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014410 }
14411 ctx->args = PyObject_GetItem(ctx->dict, key);
14412 Py_DECREF(key);
14413 if (ctx->args == NULL)
14414 return -1;
14415 ctx->args_owned = 1;
14416 ctx->arglen = -1;
14417 ctx->argidx = -2;
14418 }
14419
14420 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014421 while (--ctx->fmtcnt >= 0) {
14422 arg->ch = FORMAT_READ(ctx);
14423 ctx->fmtpos++;
14424 switch (arg->ch) {
14425 case '-': arg->flags |= F_LJUST; continue;
14426 case '+': arg->flags |= F_SIGN; continue;
14427 case ' ': arg->flags |= F_BLANK; continue;
14428 case '#': arg->flags |= F_ALT; continue;
14429 case '0': arg->flags |= F_ZERO; continue;
14430 }
14431 break;
14432 }
14433
14434 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014435 if (arg->ch == '*') {
14436 v = unicode_format_getnextarg(ctx);
14437 if (v == NULL)
14438 return -1;
14439 if (!PyLong_Check(v)) {
14440 PyErr_SetString(PyExc_TypeError,
14441 "* wants int");
14442 return -1;
14443 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014444 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014445 if (arg->width == -1 && PyErr_Occurred())
14446 return -1;
14447 if (arg->width < 0) {
14448 arg->flags |= F_LJUST;
14449 arg->width = -arg->width;
14450 }
14451 if (--ctx->fmtcnt >= 0) {
14452 arg->ch = FORMAT_READ(ctx);
14453 ctx->fmtpos++;
14454 }
14455 }
14456 else if (arg->ch >= '0' && arg->ch <= '9') {
14457 arg->width = arg->ch - '0';
14458 while (--ctx->fmtcnt >= 0) {
14459 arg->ch = FORMAT_READ(ctx);
14460 ctx->fmtpos++;
14461 if (arg->ch < '0' || arg->ch > '9')
14462 break;
14463 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14464 mixing signed and unsigned comparison. Since arg->ch is between
14465 '0' and '9', casting to int is safe. */
14466 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14467 PyErr_SetString(PyExc_ValueError,
14468 "width too big");
14469 return -1;
14470 }
14471 arg->width = arg->width*10 + (arg->ch - '0');
14472 }
14473 }
14474
14475 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014476 if (arg->ch == '.') {
14477 arg->prec = 0;
14478 if (--ctx->fmtcnt >= 0) {
14479 arg->ch = FORMAT_READ(ctx);
14480 ctx->fmtpos++;
14481 }
14482 if (arg->ch == '*') {
14483 v = unicode_format_getnextarg(ctx);
14484 if (v == NULL)
14485 return -1;
14486 if (!PyLong_Check(v)) {
14487 PyErr_SetString(PyExc_TypeError,
14488 "* wants int");
14489 return -1;
14490 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014491 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014492 if (arg->prec == -1 && PyErr_Occurred())
14493 return -1;
14494 if (arg->prec < 0)
14495 arg->prec = 0;
14496 if (--ctx->fmtcnt >= 0) {
14497 arg->ch = FORMAT_READ(ctx);
14498 ctx->fmtpos++;
14499 }
14500 }
14501 else if (arg->ch >= '0' && arg->ch <= '9') {
14502 arg->prec = arg->ch - '0';
14503 while (--ctx->fmtcnt >= 0) {
14504 arg->ch = FORMAT_READ(ctx);
14505 ctx->fmtpos++;
14506 if (arg->ch < '0' || arg->ch > '9')
14507 break;
14508 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14509 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014510 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014511 return -1;
14512 }
14513 arg->prec = arg->prec*10 + (arg->ch - '0');
14514 }
14515 }
14516 }
14517
14518 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14519 if (ctx->fmtcnt >= 0) {
14520 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14521 if (--ctx->fmtcnt >= 0) {
14522 arg->ch = FORMAT_READ(ctx);
14523 ctx->fmtpos++;
14524 }
14525 }
14526 }
14527 if (ctx->fmtcnt < 0) {
14528 PyErr_SetString(PyExc_ValueError,
14529 "incomplete format");
14530 return -1;
14531 }
14532 return 0;
14533
14534#undef FORMAT_READ
14535}
14536
14537/* Format one argument. Supported conversion specifiers:
14538
14539 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014540 - "i", "d", "u": int or float
14541 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014542 - "e", "E", "f", "F", "g", "G": float
14543 - "c": int or str (1 character)
14544
Victor Stinner8dbd4212012-12-04 09:30:24 +010014545 When possible, the output is written directly into the Unicode writer
14546 (ctx->writer). A string is created when padding is required.
14547
Victor Stinnera47082312012-10-04 02:19:54 +020014548 Return 0 if the argument has been formatted into *p_str,
14549 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014550 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014551static int
14552unicode_format_arg_format(struct unicode_formatter_t *ctx,
14553 struct unicode_format_arg_t *arg,
14554 PyObject **p_str)
14555{
14556 PyObject *v;
14557 _PyUnicodeWriter *writer = &ctx->writer;
14558
14559 if (ctx->fmtcnt == 0)
14560 ctx->writer.overallocate = 0;
14561
Victor Stinnera47082312012-10-04 02:19:54 +020014562 v = unicode_format_getnextarg(ctx);
14563 if (v == NULL)
14564 return -1;
14565
Victor Stinnera47082312012-10-04 02:19:54 +020014566
14567 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014568 case 's':
14569 case 'r':
14570 case 'a':
14571 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14572 /* Fast path */
14573 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14574 return -1;
14575 return 1;
14576 }
14577
14578 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14579 *p_str = v;
14580 Py_INCREF(*p_str);
14581 }
14582 else {
14583 if (arg->ch == 's')
14584 *p_str = PyObject_Str(v);
14585 else if (arg->ch == 'r')
14586 *p_str = PyObject_Repr(v);
14587 else
14588 *p_str = PyObject_ASCII(v);
14589 }
14590 break;
14591
14592 case 'i':
14593 case 'd':
14594 case 'u':
14595 case 'o':
14596 case 'x':
14597 case 'X':
14598 {
14599 int ret = mainformatlong(v, arg, p_str, writer);
14600 if (ret != 0)
14601 return ret;
14602 arg->sign = 1;
14603 break;
14604 }
14605
14606 case 'e':
14607 case 'E':
14608 case 'f':
14609 case 'F':
14610 case 'g':
14611 case 'G':
14612 if (arg->width == -1 && arg->prec == -1
14613 && !(arg->flags & (F_SIGN | F_BLANK)))
14614 {
14615 /* Fast path */
14616 if (formatfloat(v, arg, NULL, writer) == -1)
14617 return -1;
14618 return 1;
14619 }
14620
14621 arg->sign = 1;
14622 if (formatfloat(v, arg, p_str, NULL) == -1)
14623 return -1;
14624 break;
14625
14626 case 'c':
14627 {
14628 Py_UCS4 ch = formatchar(v);
14629 if (ch == (Py_UCS4) -1)
14630 return -1;
14631 if (arg->width == -1 && arg->prec == -1) {
14632 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014633 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014634 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014635 return 1;
14636 }
14637 *p_str = PyUnicode_FromOrdinal(ch);
14638 break;
14639 }
14640
14641 default:
14642 PyErr_Format(PyExc_ValueError,
14643 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014644 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014645 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14646 (int)arg->ch,
14647 ctx->fmtpos - 1);
14648 return -1;
14649 }
14650 if (*p_str == NULL)
14651 return -1;
14652 assert (PyUnicode_Check(*p_str));
14653 return 0;
14654}
14655
14656static int
14657unicode_format_arg_output(struct unicode_formatter_t *ctx,
14658 struct unicode_format_arg_t *arg,
14659 PyObject *str)
14660{
14661 Py_ssize_t len;
14662 enum PyUnicode_Kind kind;
14663 void *pbuf;
14664 Py_ssize_t pindex;
14665 Py_UCS4 signchar;
14666 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014667 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014668 Py_ssize_t sublen;
14669 _PyUnicodeWriter *writer = &ctx->writer;
14670 Py_UCS4 fill;
14671
14672 fill = ' ';
14673 if (arg->sign && arg->flags & F_ZERO)
14674 fill = '0';
14675
14676 if (PyUnicode_READY(str) == -1)
14677 return -1;
14678
14679 len = PyUnicode_GET_LENGTH(str);
14680 if ((arg->width == -1 || arg->width <= len)
14681 && (arg->prec == -1 || arg->prec >= len)
14682 && !(arg->flags & (F_SIGN | F_BLANK)))
14683 {
14684 /* Fast path */
14685 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14686 return -1;
14687 return 0;
14688 }
14689
14690 /* Truncate the string for "s", "r" and "a" formats
14691 if the precision is set */
14692 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14693 if (arg->prec >= 0 && len > arg->prec)
14694 len = arg->prec;
14695 }
14696
14697 /* Adjust sign and width */
14698 kind = PyUnicode_KIND(str);
14699 pbuf = PyUnicode_DATA(str);
14700 pindex = 0;
14701 signchar = '\0';
14702 if (arg->sign) {
14703 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14704 if (ch == '-' || ch == '+') {
14705 signchar = ch;
14706 len--;
14707 pindex++;
14708 }
14709 else if (arg->flags & F_SIGN)
14710 signchar = '+';
14711 else if (arg->flags & F_BLANK)
14712 signchar = ' ';
14713 else
14714 arg->sign = 0;
14715 }
14716 if (arg->width < len)
14717 arg->width = len;
14718
14719 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014720 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014721 if (!(arg->flags & F_LJUST)) {
14722 if (arg->sign) {
14723 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014724 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014725 }
14726 else {
14727 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014728 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014729 }
14730 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014731 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14732 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014733 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014734 }
14735
Victor Stinnera47082312012-10-04 02:19:54 +020014736 buflen = arg->width;
14737 if (arg->sign && len == arg->width)
14738 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014739 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014740 return -1;
14741
14742 /* Write the sign if needed */
14743 if (arg->sign) {
14744 if (fill != ' ') {
14745 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14746 writer->pos += 1;
14747 }
14748 if (arg->width > len)
14749 arg->width--;
14750 }
14751
14752 /* Write the numeric prefix for "x", "X" and "o" formats
14753 if the alternate form is used.
14754 For example, write "0x" for the "%#x" format. */
14755 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14756 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14757 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14758 if (fill != ' ') {
14759 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14760 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14761 writer->pos += 2;
14762 pindex += 2;
14763 }
14764 arg->width -= 2;
14765 if (arg->width < 0)
14766 arg->width = 0;
14767 len -= 2;
14768 }
14769
14770 /* Pad left with the fill character if needed */
14771 if (arg->width > len && !(arg->flags & F_LJUST)) {
14772 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014773 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014774 writer->pos += sublen;
14775 arg->width = len;
14776 }
14777
14778 /* If padding with spaces: write sign if needed and/or numeric prefix if
14779 the alternate form is used */
14780 if (fill == ' ') {
14781 if (arg->sign) {
14782 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14783 writer->pos += 1;
14784 }
14785 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14786 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14787 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14788 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14789 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14790 writer->pos += 2;
14791 pindex += 2;
14792 }
14793 }
14794
14795 /* Write characters */
14796 if (len) {
14797 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14798 str, pindex, len);
14799 writer->pos += len;
14800 }
14801
14802 /* Pad right with the fill character if needed */
14803 if (arg->width > len) {
14804 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014805 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014806 writer->pos += sublen;
14807 }
14808 return 0;
14809}
14810
14811/* Helper of PyUnicode_Format(): format one arg.
14812 Return 0 on success, raise an exception and return -1 on error. */
14813static int
14814unicode_format_arg(struct unicode_formatter_t *ctx)
14815{
14816 struct unicode_format_arg_t arg;
14817 PyObject *str;
14818 int ret;
14819
Victor Stinner8dbd4212012-12-04 09:30:24 +010014820 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014821 if (arg.ch == '%') {
14822 ctx->fmtpos++;
14823 ctx->fmtcnt--;
14824 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14825 return -1;
14826 return 0;
14827 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014828 arg.flags = 0;
14829 arg.width = -1;
14830 arg.prec = -1;
14831 arg.sign = 0;
14832 str = NULL;
14833
Victor Stinnera47082312012-10-04 02:19:54 +020014834 ret = unicode_format_arg_parse(ctx, &arg);
14835 if (ret == -1)
14836 return -1;
14837
14838 ret = unicode_format_arg_format(ctx, &arg, &str);
14839 if (ret == -1)
14840 return -1;
14841
14842 if (ret != 1) {
14843 ret = unicode_format_arg_output(ctx, &arg, str);
14844 Py_DECREF(str);
14845 if (ret == -1)
14846 return -1;
14847 }
14848
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014849 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014850 PyErr_SetString(PyExc_TypeError,
14851 "not all arguments converted during string formatting");
14852 return -1;
14853 }
14854 return 0;
14855}
14856
Alexander Belopolsky40018472011-02-26 01:02:56 +000014857PyObject *
14858PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014859{
Victor Stinnera47082312012-10-04 02:19:54 +020014860 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014861
Guido van Rossumd57fd912000-03-10 22:53:23 +000014862 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014863 PyErr_BadInternalCall();
14864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014865 }
Victor Stinnera47082312012-10-04 02:19:54 +020014866
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014867 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014868 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014869
14870 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014871 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14872 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14873 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14874 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014875
Victor Stinner8f674cc2013-04-17 23:02:17 +020014876 _PyUnicodeWriter_Init(&ctx.writer);
14877 ctx.writer.min_length = ctx.fmtcnt + 100;
14878 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014879
Guido van Rossumd57fd912000-03-10 22:53:23 +000014880 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014881 ctx.arglen = PyTuple_Size(args);
14882 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014883 }
14884 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014885 ctx.arglen = -1;
14886 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014887 }
Victor Stinnera47082312012-10-04 02:19:54 +020014888 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014889 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014890 ctx.dict = args;
14891 else
14892 ctx.dict = NULL;
14893 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014894
Victor Stinnera47082312012-10-04 02:19:54 +020014895 while (--ctx.fmtcnt >= 0) {
14896 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014897 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014898
14899 nonfmtpos = ctx.fmtpos++;
14900 while (ctx.fmtcnt >= 0 &&
14901 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14902 ctx.fmtpos++;
14903 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014904 }
Victor Stinnera47082312012-10-04 02:19:54 +020014905 if (ctx.fmtcnt < 0) {
14906 ctx.fmtpos--;
14907 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014908 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014909
Victor Stinnercfc4c132013-04-03 01:48:39 +020014910 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14911 nonfmtpos, ctx.fmtpos) < 0)
14912 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014913 }
14914 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014915 ctx.fmtpos++;
14916 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014917 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014918 }
14919 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014920
Victor Stinnera47082312012-10-04 02:19:54 +020014921 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014922 PyErr_SetString(PyExc_TypeError,
14923 "not all arguments converted during string formatting");
14924 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014925 }
14926
Victor Stinnera47082312012-10-04 02:19:54 +020014927 if (ctx.args_owned) {
14928 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014929 }
Victor Stinnera47082312012-10-04 02:19:54 +020014930 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014931
Benjamin Peterson29060642009-01-31 22:14:21 +000014932 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014933 _PyUnicodeWriter_Dealloc(&ctx.writer);
14934 if (ctx.args_owned) {
14935 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014936 }
14937 return NULL;
14938}
14939
Jeremy Hylton938ace62002-07-17 16:30:39 +000014940static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014941unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14942
Tim Peters6d6c1a32001-08-02 04:15:00 +000014943static PyObject *
14944unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14945{
Benjamin Peterson29060642009-01-31 22:14:21 +000014946 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014947 static char *kwlist[] = {"object", "encoding", "errors", 0};
14948 char *encoding = NULL;
14949 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014950
Benjamin Peterson14339b62009-01-31 16:36:08 +000014951 if (type != &PyUnicode_Type)
14952 return unicode_subtype_new(type, args, kwds);
14953 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014954 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014955 return NULL;
14956 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014957 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014958 if (encoding == NULL && errors == NULL)
14959 return PyObject_Str(x);
14960 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014961 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014962}
14963
Guido van Rossume023fe02001-08-30 03:12:59 +000014964static PyObject *
14965unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14966{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014967 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014968 Py_ssize_t length, char_size;
14969 int share_wstr, share_utf8;
14970 unsigned int kind;
14971 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014972
Benjamin Peterson14339b62009-01-31 16:36:08 +000014973 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014974
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014975 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014976 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014977 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014978 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014979 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014980 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014981 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014982 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014983
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014984 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014985 if (self == NULL) {
14986 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014987 return NULL;
14988 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014989 kind = PyUnicode_KIND(unicode);
14990 length = PyUnicode_GET_LENGTH(unicode);
14991
14992 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014993#ifdef Py_DEBUG
14994 _PyUnicode_HASH(self) = -1;
14995#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014996 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014997#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014998 _PyUnicode_STATE(self).interned = 0;
14999 _PyUnicode_STATE(self).kind = kind;
15000 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015001 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015002 _PyUnicode_STATE(self).ready = 1;
15003 _PyUnicode_WSTR(self) = NULL;
15004 _PyUnicode_UTF8_LENGTH(self) = 0;
15005 _PyUnicode_UTF8(self) = NULL;
15006 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015007 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015008
15009 share_utf8 = 0;
15010 share_wstr = 0;
15011 if (kind == PyUnicode_1BYTE_KIND) {
15012 char_size = 1;
15013 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15014 share_utf8 = 1;
15015 }
15016 else if (kind == PyUnicode_2BYTE_KIND) {
15017 char_size = 2;
15018 if (sizeof(wchar_t) == 2)
15019 share_wstr = 1;
15020 }
15021 else {
15022 assert(kind == PyUnicode_4BYTE_KIND);
15023 char_size = 4;
15024 if (sizeof(wchar_t) == 4)
15025 share_wstr = 1;
15026 }
15027
15028 /* Ensure we won't overflow the length. */
15029 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15030 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015031 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015032 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015033 data = PyObject_MALLOC((length + 1) * char_size);
15034 if (data == NULL) {
15035 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015036 goto onError;
15037 }
15038
Victor Stinnerc3c74152011-10-02 20:39:55 +020015039 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015040 if (share_utf8) {
15041 _PyUnicode_UTF8_LENGTH(self) = length;
15042 _PyUnicode_UTF8(self) = data;
15043 }
15044 if (share_wstr) {
15045 _PyUnicode_WSTR_LENGTH(self) = length;
15046 _PyUnicode_WSTR(self) = (wchar_t *)data;
15047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015048
Christian Heimesf051e432016-09-13 20:22:02 +020015049 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015050 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015051 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015052#ifdef Py_DEBUG
15053 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15054#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015055 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015056 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015057
15058onError:
15059 Py_DECREF(unicode);
15060 Py_DECREF(self);
15061 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015062}
15063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015064PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015065"str(object='') -> str\n\
15066str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015067\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015068Create a new string object from the given object. If encoding or\n\
15069errors is specified, then the object must expose a data buffer\n\
15070that will be decoded using the given encoding and error handler.\n\
15071Otherwise, returns the result of object.__str__() (if defined)\n\
15072or repr(object).\n\
15073encoding defaults to sys.getdefaultencoding().\n\
15074errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015075
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015076static PyObject *unicode_iter(PyObject *seq);
15077
Guido van Rossumd57fd912000-03-10 22:53:23 +000015078PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015079 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015080 "str", /* tp_name */
15081 sizeof(PyUnicodeObject), /* tp_basicsize */
15082 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015083 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015084 (destructor)unicode_dealloc, /* tp_dealloc */
15085 0, /* tp_print */
15086 0, /* tp_getattr */
15087 0, /* tp_setattr */
15088 0, /* tp_reserved */
15089 unicode_repr, /* tp_repr */
15090 &unicode_as_number, /* tp_as_number */
15091 &unicode_as_sequence, /* tp_as_sequence */
15092 &unicode_as_mapping, /* tp_as_mapping */
15093 (hashfunc) unicode_hash, /* tp_hash*/
15094 0, /* tp_call*/
15095 (reprfunc) unicode_str, /* tp_str */
15096 PyObject_GenericGetAttr, /* tp_getattro */
15097 0, /* tp_setattro */
15098 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015099 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015100 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15101 unicode_doc, /* tp_doc */
15102 0, /* tp_traverse */
15103 0, /* tp_clear */
15104 PyUnicode_RichCompare, /* tp_richcompare */
15105 0, /* tp_weaklistoffset */
15106 unicode_iter, /* tp_iter */
15107 0, /* tp_iternext */
15108 unicode_methods, /* tp_methods */
15109 0, /* tp_members */
15110 0, /* tp_getset */
15111 &PyBaseObject_Type, /* tp_base */
15112 0, /* tp_dict */
15113 0, /* tp_descr_get */
15114 0, /* tp_descr_set */
15115 0, /* tp_dictoffset */
15116 0, /* tp_init */
15117 0, /* tp_alloc */
15118 unicode_new, /* tp_new */
15119 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015120};
15121
15122/* Initialize the Unicode implementation */
15123
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015124_PyInitError
15125_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015126{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015127 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015128 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015129 0x000A, /* LINE FEED */
15130 0x000D, /* CARRIAGE RETURN */
15131 0x001C, /* FILE SEPARATOR */
15132 0x001D, /* GROUP SEPARATOR */
15133 0x001E, /* RECORD SEPARATOR */
15134 0x0085, /* NEXT LINE */
15135 0x2028, /* LINE SEPARATOR */
15136 0x2029, /* PARAGRAPH SEPARATOR */
15137 };
15138
Fred Drakee4315f52000-05-09 19:53:39 +000015139 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015140 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015141 if (!unicode_empty) {
15142 return _Py_INIT_ERR("Can't create empty string");
15143 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015144 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015145
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015146 if (PyType_Ready(&PyUnicode_Type) < 0) {
15147 return _Py_INIT_ERR("Can't initialize unicode type");
15148 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015149
15150 /* initialize the linebreak bloom filter */
15151 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015152 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015153 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015154
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015155 if (PyType_Ready(&EncodingMapType) < 0) {
15156 return _Py_INIT_ERR("Can't initialize encoding map type");
15157 }
15158 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15159 return _Py_INIT_ERR("Can't initialize field name iterator type");
15160 }
15161 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15162 return _Py_INIT_ERR("Can't initialize formatter iter type");
15163 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015164 return _Py_INIT_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015165}
15166
15167/* Finalize the Unicode implementation */
15168
/* Clear the unicode free list.

   This implementation keeps no free list, so there is never anything to
   release; the function always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15174
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015175
Walter Dörwald16807132007-05-25 13:52:07 +000015176void
15177PyUnicode_InternInPlace(PyObject **p)
15178{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015179 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015180 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015181#ifdef Py_DEBUG
15182 assert(s != NULL);
15183 assert(_PyUnicode_CHECK(s));
15184#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015185 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015186 return;
15187#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015188 /* If it's a subclass, we don't really know what putting
15189 it in the interned dict might do. */
15190 if (!PyUnicode_CheckExact(s))
15191 return;
15192 if (PyUnicode_CHECK_INTERNED(s))
15193 return;
15194 if (interned == NULL) {
15195 interned = PyDict_New();
15196 if (interned == NULL) {
15197 PyErr_Clear(); /* Don't leave an exception */
15198 return;
15199 }
15200 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015201 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015202 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015203 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015204 if (t == NULL) {
15205 PyErr_Clear();
15206 return;
15207 }
15208 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015209 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015210 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015211 return;
15212 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015213 /* The two references in interned are not counted by refcnt.
15214 The deallocator will take care of this */
15215 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015216 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015217}
15218
15219void
15220PyUnicode_InternImmortal(PyObject **p)
15221{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015222 PyUnicode_InternInPlace(p);
15223 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015224 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015225 Py_INCREF(*p);
15226 }
Walter Dörwald16807132007-05-25 13:52:07 +000015227}
15228
15229PyObject *
15230PyUnicode_InternFromString(const char *cp)
15231{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015232 PyObject *s = PyUnicode_FromString(cp);
15233 if (s == NULL)
15234 return NULL;
15235 PyUnicode_InternInPlace(&s);
15236 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015237}
15238
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015239
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Tear down the interned-string dict for the benefit of leak detectors.

   Interned strings are not forcibly deallocated.  Each one gets back the
   references that interning did not count (one for an immortal string, two
   for a mortal one) and is flagged as not interned; afterwards the dict is
   cleared and released. */
static void
unicode_release_interned(void)
{
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif

    for (Py_ssize_t pos = 0; pos < count; pos++) {
        PyObject *str = PyList_GET_ITEM(keys, pos);

        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        const int state = PyUnicode_CHECK_INTERNED(str);
        if (state == SSTATE_INTERNED_IMMORTAL) {
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
        }
        else if (state == SSTATE_INTERNED_MORTAL) {
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
        }
        else if (state != SSTATE_NOT_INTERNED) {
            Py_FatalError("Inconsistent interned string state.");
        }
        /* else: SSTATE_NOT_INTERNED -- XXX Shouldn't happen */

        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }

#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif

    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015299
15300
15301/********************* Unicode Iterator **************************/
15302
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15308
/* Destroy a str iterator: untrack it from the GC, release the reference
   to the iterated string (if the iterator still holds one) and free the
   iterator object itself. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    /* it_seq is NULL once the iterator has been exhausted. */
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15316
/* GC traversal for a str iterator: the only object reference it holds is
   the string being iterated. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15323
15324static PyObject *
15325unicodeiter_next(unicodeiterobject *it)
15326{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015327 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015328
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 assert(it != NULL);
15330 seq = it->it_seq;
15331 if (seq == NULL)
15332 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015333 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015335 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15336 int kind = PyUnicode_KIND(seq);
15337 void *data = PyUnicode_DATA(seq);
15338 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15339 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015340 if (item != NULL)
15341 ++it->it_index;
15342 return item;
15343 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015344
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015346 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015348}
15349
15350static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015351unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015352{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 Py_ssize_t len = 0;
15354 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015355 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015357}
15358
/* Docstring of the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15360
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015361static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015362unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015363{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015364 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015365 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015366 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015367 it->it_seq, it->it_index);
15368 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015369 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015370 if (u == NULL)
15371 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015372 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015373 }
15374}
15375
/* Docstring of the iterator's __reduce__ method. */
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15377
15378static PyObject *
15379unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15380{
15381 Py_ssize_t index = PyLong_AsSsize_t(state);
15382 if (index == -1 && PyErr_Occurred())
15383 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015384 if (it->it_seq != NULL) {
15385 if (index < 0)
15386 index = 0;
15387 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15388 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15389 it->it_index = index;
15390 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015391 Py_RETURN_NONE;
15392}
15393
/* Docstring of the iterator's __setstate__ method. */
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15395
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015396static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015397 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015398 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015399 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15400 reduce_doc},
15401 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15402 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015403 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015404};
15405
15406PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015407 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15408 "str_iterator", /* tp_name */
15409 sizeof(unicodeiterobject), /* tp_basicsize */
15410 0, /* tp_itemsize */
15411 /* methods */
15412 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15413 0, /* tp_print */
15414 0, /* tp_getattr */
15415 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015416 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 0, /* tp_repr */
15418 0, /* tp_as_number */
15419 0, /* tp_as_sequence */
15420 0, /* tp_as_mapping */
15421 0, /* tp_hash */
15422 0, /* tp_call */
15423 0, /* tp_str */
15424 PyObject_GenericGetAttr, /* tp_getattro */
15425 0, /* tp_setattro */
15426 0, /* tp_as_buffer */
15427 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15428 0, /* tp_doc */
15429 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15430 0, /* tp_clear */
15431 0, /* tp_richcompare */
15432 0, /* tp_weaklistoffset */
15433 PyObject_SelfIter, /* tp_iter */
15434 (iternextfunc)unicodeiter_next, /* tp_iternext */
15435 unicodeiter_methods, /* tp_methods */
15436 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015437};
15438
15439static PyObject *
15440unicode_iter(PyObject *seq)
15441{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015442 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015443
Benjamin Peterson14339b62009-01-31 16:36:08 +000015444 if (!PyUnicode_Check(seq)) {
15445 PyErr_BadInternalCall();
15446 return NULL;
15447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015448 if (PyUnicode_READY(seq) == -1)
15449 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015450 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15451 if (it == NULL)
15452 return NULL;
15453 it->it_index = 0;
15454 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015455 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015456 _PyObject_GC_TRACK(it);
15457 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015458}
15459
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015460
/* Return the number of elements in the null-terminated Py_UNICODE string u,
   not counting the terminator.  Delegates to wcslen().
   NOTE(review): this presumably relies on Py_UNICODE being wchar_t --
   confirm against the Py_UNICODE typedef. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    return wcslen(u);
}
15466
15467Py_UNICODE*
15468Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15469{
15470 Py_UNICODE *u = s1;
15471 while ((*u++ = *s2++));
15472 return s1;
15473}
15474
15475Py_UNICODE*
15476Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15477{
15478 Py_UNICODE *u = s1;
15479 while ((*u++ = *s2++))
15480 if (n-- == 0)
15481 break;
15482 return s1;
15483}
15484
15485Py_UNICODE*
15486Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15487{
15488 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015489 u1 += wcslen(u1);
15490 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015491 return s1;
15492}
15493
15494int
15495Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15496{
15497 while (*s1 && *s2 && *s1 == *s2)
15498 s1++, s2++;
15499 if (*s1 && *s2)
15500 return (*s1 < *s2) ? -1 : +1;
15501 if (*s1)
15502 return 1;
15503 if (*s2)
15504 return -1;
15505 return 0;
15506}
15507
15508int
15509Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15510{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015511 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015512 for (; n != 0; n--) {
15513 u1 = *s1;
15514 u2 = *s2;
15515 if (u1 != u2)
15516 return (u1 < u2) ? -1 : +1;
15517 if (u1 == '\0')
15518 return 0;
15519 s1++;
15520 s2++;
15521 }
15522 return 0;
15523}
15524
15525Py_UNICODE*
15526Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15527{
15528 const Py_UNICODE *p;
15529 for (p = s; *p; p++)
15530 if (*p == c)
15531 return (Py_UNICODE*)p;
15532 return NULL;
15533}
15534
15535Py_UNICODE*
15536Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15537{
15538 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015539 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015540 while (p != s) {
15541 p--;
15542 if (*p == c)
15543 return (Py_UNICODE*)p;
15544 }
15545 return NULL;
15546}
Victor Stinner331ea922010-08-10 16:37:20 +000015547
Victor Stinner71133ff2010-09-01 23:43:53 +000015548Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015549PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015550{
Victor Stinner577db2c2011-10-11 22:12:48 +020015551 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015552 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015554 if (!PyUnicode_Check(unicode)) {
15555 PyErr_BadArgument();
15556 return NULL;
15557 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015558 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015559 if (u == NULL)
15560 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015561 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015562 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015563 PyErr_NoMemory();
15564 return NULL;
15565 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015566 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015567 size *= sizeof(Py_UNICODE);
15568 copy = PyMem_Malloc(size);
15569 if (copy == NULL) {
15570 PyErr_NoMemory();
15571 return NULL;
15572 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015573 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015574 return copy;
15575}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015576
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015577
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015578static char*
15579get_codec_name(const char *encoding)
15580{
15581 PyObject *codec, *name_obj = NULL;
15582
15583 codec = _PyCodec_Lookup(encoding);
15584 if (!codec)
15585 goto error;
15586
15587 name_obj = PyObject_GetAttrString(codec, "name");
15588 Py_CLEAR(codec);
15589 if (!name_obj) {
15590 goto error;
15591 }
15592
15593 const char *name_utf8 = PyUnicode_AsUTF8(name_obj);
15594 if (name_utf8 == NULL) {
15595 goto error;
15596 }
15597
15598 char *name = _PyMem_RawStrdup(name_utf8);
15599 Py_DECREF(name_obj);
15600 if (name == NULL) {
15601 PyErr_NoMemory();
15602 return NULL;
15603 }
15604 return name;
15605
15606error:
15607 Py_XDECREF(codec);
15608 Py_XDECREF(name_obj);
15609 return NULL;
15610}
15611
15612
15613static _PyInitError
15614init_stdio_encoding(PyInterpreterState *interp)
15615{
15616 _PyCoreConfig *config = &interp->core_config;
15617
15618 char *codec_name = get_codec_name(config->stdio_encoding);
15619 if (codec_name == NULL) {
15620 return _Py_INIT_ERR("failed to get the Python codec name "
15621 "of the stdio encoding");
15622 }
15623 PyMem_RawFree(config->stdio_encoding);
15624 config->stdio_encoding = codec_name;
15625 return _Py_INIT_OK();
15626}
15627
15628
15629static _PyInitError
15630init_fs_encoding(PyInterpreterState *interp)
15631{
15632 _PyCoreConfig *config = &interp->core_config;
15633
15634 char *encoding = get_codec_name(config->filesystem_encoding);
15635 if (encoding == NULL) {
15636 /* Such error can only occurs in critical situations: no more
15637 memory, import a module of the standard library failed, etc. */
15638 return _Py_INIT_ERR("failed to get the Python codec "
15639 "of the filesystem encoding");
15640 }
15641
15642 /* Update the filesystem encoding to the normalized Python codec name.
15643 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15644 (Python codec name). */
15645 PyMem_RawFree(config->filesystem_encoding);
15646 config->filesystem_encoding = encoding;
15647
15648 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15649 global configuration variables. */
15650 if (_Py_SetFileSystemEncoding(config->filesystem_encoding,
15651 config->filesystem_errors) < 0) {
15652 return _Py_INIT_NO_MEMORY();
15653 }
15654
15655 /* PyUnicode can now use the Python codec rather than C implementation
15656 for the filesystem encoding */
15657 interp->fscodec_initialized = 1;
15658 return _Py_INIT_OK();
15659}
15660
15661
15662_PyInitError
15663_PyUnicode_InitEncodings(PyInterpreterState *interp)
15664{
15665 _PyInitError err = init_fs_encoding(interp);
15666 if (_Py_INIT_FAILED(err)) {
15667 return err;
15668 }
15669
15670 return init_stdio_encoding(interp);
15671}
15672
15673
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015674void
15675_PyUnicode_Fini(void)
15676{
15677#if defined(WITH_VALGRIND) || defined(__INSURE__)
15678 /* Insure++ is a memory analysis tool that aids in discovering
15679 * memory leaks and other memory problems. On Python exit, the
15680 * interned string dictionaries are flagged as being in use at exit
15681 * (which it is). Under normal circumstances, this is fine because
15682 * the memory will be automatically reclaimed by the system. Under
15683 * memory debugging, it's a huge source of useless noise, so we
15684 * trade off slower shutdown for less distraction in the memory
15685 * reports. -baw
15686 */
15687 unicode_release_interned();
15688#endif /* __INSURE__ */
15689
15690 Py_CLEAR(unicode_empty);
15691
15692 for (Py_ssize_t i = 0; i < 256; i++) {
15693 Py_CLEAR(unicode_latin1[i]);
15694 }
15695 _PyUnicode_ClearStaticStrings();
15696 (void)PyUnicode_ClearFreeList();
15697}
15698
15699
Georg Brandl66c221e2010-10-14 07:04:07 +000015700/* A _string module, to export formatter_parser and formatter_field_name_split
15701 to the string.Formatter class implemented in Python. */
15702
15703static PyMethodDef _string_methods[] = {
15704 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15705 METH_O, PyDoc_STR("split the argument as a field name")},
15706 {"formatter_parser", (PyCFunction) formatter_parser,
15707 METH_O, PyDoc_STR("parse the argument as a format string")},
15708 {NULL, NULL}
15709};
15710
15711static struct PyModuleDef _string_module = {
15712 PyModuleDef_HEAD_INIT,
15713 "_string",
15714 PyDoc_STR("string helper module"),
15715 0,
15716 _string_methods,
15717 NULL,
15718 NULL,
15719 NULL,
15720 NULL
15721};
15722
/* Module initialization function of _string: create the module object
   from _string_module.  Returns whatever PyModule_Create() returns. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
15728
15729
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015730#ifdef __cplusplus
15731}
15732#endif