blob: c4d93fca0451a3545a0fa17c7c2f4c227d9f8c95 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Larry Hastings61272b72014-01-07 12:41:53 -080052/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090053class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080054[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090055/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56
57/*[python input]
58class Py_UCS4_converter(CConverter):
59 type = 'Py_UCS4'
60 converter = 'convert_uc'
61
62 def converter_init(self):
63 if self.default is not unspecified:
64 self.c_default = ascii(self.default)
65 if len(self.c_default) > 4 or self.c_default[0] != "'":
66 self.c_default = hex(ord(self.default))
67
68[python start generated code]*/
69/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000071/* --- Globals ------------------------------------------------------------
72
Serhiy Storchaka05997252013-01-26 12:14:02 +020073NOTE: In the interpreter's initialization phase, some globals are currently
74 initialized dynamically as needed. In the process Unicode objects may
75 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000076
77*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
/* Highest valid Unicode code point: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10FFFF
86
/* Sanity check used by the accessor macros below.  Release builds only do
   the type check; debug builds run _PyUnicode_CheckConsistency() with
   check_content=0, i.e. without scanning the characters. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Field accessors for the string structs.  Macros that only cast and
   select a member are usable as lvalues.  The variants containing
   assert() validate the object first. */

/* The utf8 member of a PyCompactUnicodeObject (no checking). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 bytes of a ready string: for compact ASCII objects this is the
   memory directly after the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                          \
    (assert(_PyUnicode_CHECK(op)),                  \
     assert(PyUnicode_IS_READY(op)),                \
     PyUnicode_IS_COMPACT_ASCII(op) ?               \
         ((char*)((PyASCIIObject*)(op) + 1)) :      \
         _PyUnicode_UTF8(op))

/* The utf8_length member of a PyCompactUnicodeObject (no checking). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: for compact ASCII objects this is the
   length member, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                   \
    (assert(_PyUnicode_CHECK(op)),                  \
     assert(PyUnicode_IS_READY(op)),                \
     PyUnicode_IS_COMPACT_ASCII(op) ?               \
         ((PyASCIIObject*)(op))->length :           \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the wstr, wstr_length, length, state and hash
   members. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to state.kind and length. */
#define _PyUnicode_KIND(op)                         \
    (assert(_PyUnicode_CHECK(op)),                  \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                   \
    (assert(_PyUnicode_CHECK(op)),                  \
     ((PyASCIIObject *)(op))->length)

/* The data.any pointer of a PyUnicodeObject (no checking). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200127
/* File-local replacement for PyUnicode_READY(): any earlier definition is
   dropped.  Evaluates to 0 if the string is already ready, otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* True if the utf8 pointer of a non-compact-ASCII string is the very same
   pointer as the character data, i.e. no separate UTF-8 buffer exists. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the very same pointer as the character data.
   The macro must only refer to its own parameter: it previously expanded
   _PyUnicode_WSTR(unicode), which silently depended on the caller having a
   variable named "unicode" that happened to be the same object. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* True if the Unicode object owns a UTF-8 memory block of its own, i.e. it
   is not compact ASCII, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a wstr memory block of its own, i.e. its
   wstr pointer is set and either the string is not ready yet or wstr is not
   the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
156
/* Generic helper that copies a run of characters while converting each one
   from from_type to to_type with a plain cast.

   from_type, to_type: valid type names.
   begin, end:         delimit the source characters ("from_type *").
   to:                 destination buffer ("to_type *"); it must have room
                       for (end - begin) items.

   The bulk of the input is copied four items per iteration; the remaining
   0-3 items are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (const from_type *)(begin);             \
        const from_type *_stop = (const from_type *)(end);              \
        Py_ssize_t _count = _stop - _src;                               \
        const from_type *_unrolled_stop =                               \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < _unrolled_stop; _src += 4, _dst += 4) {           \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < _stop; ++_src, ++_dst) {                          \
            *_dst = (to_type) *_src;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
/* Platform-tuned overallocation factor. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% works best. */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% works best. */
#  define OVERALLOCATE_FACTOR 2
#endif
188
Walter Dörwald16807132007-05-25 13:52:07 +0000189/* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
194 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000195 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000196*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203 do { \
204 if (unicode_empty != NULL) \
205 Py_INCREF(unicode_empty); \
206 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207 unicode_empty = PyUnicode_New(0, 0); \
208 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL if the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()          \
    do {                                    \
        _Py_INCREF_UNICODE_EMPTY();         \
        return unicode_empty;               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200221/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700222static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200223_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
224
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200225/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200227
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000228/* Single character Unicode strings in the Latin-1 range are being
229 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200230static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by ASCII code (0..127); an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
262
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200263/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200264static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200265static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100266static int unicode_modifiable(PyObject *unicode);
267
Victor Stinnerfe226c02011-10-03 03:52:20 +0200268
Alexander Belopolsky40018472011-02-26 01:02:56 +0000269static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100270_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200271static PyObject *
272_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
273static PyObject *
274_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
275
276static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000278 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100279 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000280 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
281
Alexander Belopolsky40018472011-02-26 01:02:56 +0000282static void
283raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300284 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100285 PyObject *unicode,
286 Py_ssize_t startpos, Py_ssize_t endpos,
287 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000288
/* Same for linebreaks.

   Lookup table indexed by ASCII code (0..127); an entry is 1 for:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
316
INADA Naoki3ae20562017-01-16 20:41:20 +0900317static int convert_uc(PyObject *obj, void *addr);
318
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300319#include "clinic/unicodeobject.c.h"
320
/* Identifiers for the codec error handlers that get_error_handler()
   recognizes by name. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching identifier
   for the other recognized names (the comparison is exact and
   case-sensitive), and _Py_ERROR_OTHER for any other name. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
359
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300360/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
361 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000362Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000363PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000364{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000365#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000366 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000367#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000368 /* This is actually an illegal character, so it should
369 not be passed to unichr. */
370 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000371#endif
372}
373
#ifdef Py_DEBUG
/* Debug-build self check of a str object's internal invariants.

   op:            the object to check; must be a str.
   check_content: if non-zero (and the string is not in the wchar_t-only
                  state), also scan the characters to verify that the
                  narrowest possible kind is used and that the buffer is
                  terminated by a zero character.

   Every violation trips an assert().  The function always returns 1 so
   that it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* non-compact object: characters live behind data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* not ready yet: only the wchar_t buffer is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII string: characters follow the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character buffer must be zero-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
489
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100490static PyObject*
491unicode_result_wchar(PyObject *unicode)
492{
493#ifndef Py_DEBUG
494 Py_ssize_t len;
495
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 len = _PyUnicode_WSTR_LENGTH(unicode);
497 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100498 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200499 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100500 }
501
502 if (len == 1) {
503 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100504 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100505 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
506 Py_DECREF(unicode);
507 return latin1_char;
508 }
509 }
510
511 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200512 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100513 return NULL;
514 }
515#else
Victor Stinneraa771272012-10-04 02:32:58 +0200516 assert(Py_REFCNT(unicode) == 1);
517
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 /* don't make the result ready in debug mode to ensure that the caller
519 makes the string ready before using it */
520 assert(_PyUnicode_CheckConsistency(unicode, 1));
521#endif
522 return unicode;
523}
524
525static PyObject*
526unicode_result_ready(PyObject *unicode)
527{
528 Py_ssize_t length;
529
530 length = PyUnicode_GET_LENGTH(unicode);
531 if (length == 0) {
532 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100533 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200534 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100535 }
536 return unicode_empty;
537 }
538
539 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200540 void *data = PyUnicode_DATA(unicode);
541 int kind = PyUnicode_KIND(unicode);
542 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100543 if (ch < 256) {
544 PyObject *latin1_char = unicode_latin1[ch];
545 if (latin1_char != NULL) {
546 if (unicode != latin1_char) {
547 Py_INCREF(latin1_char);
548 Py_DECREF(unicode);
549 }
550 return latin1_char;
551 }
552 else {
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554 Py_INCREF(unicode);
555 unicode_latin1[ch] = unicode;
556 return unicode;
557 }
558 }
559 }
560
561 assert(_PyUnicode_CheckConsistency(unicode, 1));
562 return unicode;
563}
564
565static PyObject*
566unicode_result(PyObject *unicode)
567{
568 assert(_PyUnicode_CHECK(unicode));
569 if (PyUnicode_IS_READY(unicode))
570 return unicode_result_ready(unicode);
571 else
572 return unicode_result_wchar(unicode);
573}
574
Victor Stinnerc4b49542011-12-11 22:44:26 +0100575static PyObject*
576unicode_result_unchanged(PyObject *unicode)
577{
578 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500579 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100580 return NULL;
581 Py_INCREF(unicode);
582 return unicode;
583 }
584 else
585 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100586 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100587}
588
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200589/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
590 ASCII, Latin1, UTF-8, etc. */
591static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200592backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200593 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
594{
Victor Stinnerad771582015-10-09 12:38:53 +0200595 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200596 Py_UCS4 ch;
597 enum PyUnicode_Kind kind;
598 void *data;
599
600 assert(PyUnicode_IS_READY(unicode));
601 kind = PyUnicode_KIND(unicode);
602 data = PyUnicode_DATA(unicode);
603
604 size = 0;
605 /* determine replacement size */
606 for (i = collstart; i < collend; ++i) {
607 Py_ssize_t incr;
608
609 ch = PyUnicode_READ(kind, data, i);
610 if (ch < 0x100)
611 incr = 2+2;
612 else if (ch < 0x10000)
613 incr = 2+4;
614 else {
615 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200616 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200617 }
618 if (size > PY_SSIZE_T_MAX - incr) {
619 PyErr_SetString(PyExc_OverflowError,
620 "encoded result is too long for a Python string");
621 return NULL;
622 }
623 size += incr;
624 }
625
Victor Stinnerad771582015-10-09 12:38:53 +0200626 str = _PyBytesWriter_Prepare(writer, str, size);
627 if (str == NULL)
628 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200629
630 /* generate replacement */
631 for (i = collstart; i < collend; ++i) {
632 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200633 *str++ = '\\';
634 if (ch >= 0x00010000) {
635 *str++ = 'U';
636 *str++ = Py_hexdigits[(ch>>28)&0xf];
637 *str++ = Py_hexdigits[(ch>>24)&0xf];
638 *str++ = Py_hexdigits[(ch>>20)&0xf];
639 *str++ = Py_hexdigits[(ch>>16)&0xf];
640 *str++ = Py_hexdigits[(ch>>12)&0xf];
641 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200642 }
Victor Stinner797485e2015-10-09 03:17:30 +0200643 else if (ch >= 0x100) {
644 *str++ = 'u';
645 *str++ = Py_hexdigits[(ch>>12)&0xf];
646 *str++ = Py_hexdigits[(ch>>8)&0xf];
647 }
648 else
649 *str++ = 'x';
650 *str++ = Py_hexdigits[(ch>>4)&0xf];
651 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200652 }
653 return str;
654}
655
656/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
657 ASCII, Latin1, UTF-8, etc. */
658static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200659xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200660 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
661{
Victor Stinnerad771582015-10-09 12:38:53 +0200662 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200663 Py_UCS4 ch;
664 enum PyUnicode_Kind kind;
665 void *data;
666
667 assert(PyUnicode_IS_READY(unicode));
668 kind = PyUnicode_KIND(unicode);
669 data = PyUnicode_DATA(unicode);
670
671 size = 0;
672 /* determine replacement size */
673 for (i = collstart; i < collend; ++i) {
674 Py_ssize_t incr;
675
676 ch = PyUnicode_READ(kind, data, i);
677 if (ch < 10)
678 incr = 2+1+1;
679 else if (ch < 100)
680 incr = 2+2+1;
681 else if (ch < 1000)
682 incr = 2+3+1;
683 else if (ch < 10000)
684 incr = 2+4+1;
685 else if (ch < 100000)
686 incr = 2+5+1;
687 else if (ch < 1000000)
688 incr = 2+6+1;
689 else {
690 assert(ch <= MAX_UNICODE);
691 incr = 2+7+1;
692 }
693 if (size > PY_SSIZE_T_MAX - incr) {
694 PyErr_SetString(PyExc_OverflowError,
695 "encoded result is too long for a Python string");
696 return NULL;
697 }
698 size += incr;
699 }
700
Victor Stinnerad771582015-10-09 12:38:53 +0200701 str = _PyBytesWriter_Prepare(writer, str, size);
702 if (str == NULL)
703 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200704
705 /* generate replacement */
706 for (i = collstart; i < collend; ++i) {
707 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
708 }
709 return str;
710}
711
Thomas Wouters477c8d52006-05-27 19:21:47 +0000712/* --- Bloom Filters ----------------------------------------------------- */
713
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (ch & (BLOOM_WIDTH - 1))
   as the bit index. */
717
718/* the linebreak mask is set up by Unicode_Init below */
719
Antoine Pitrouf068f942010-01-13 14:19:12 +0000720#if LONG_BIT >= 128
721#define BLOOM_WIDTH 128
722#elif LONG_BIT >= 64
723#define BLOOM_WIDTH 64
724#elif LONG_BIT >= 32
725#define BLOOM_WIDTH 32
726#else
727#error "LONG_BIT is smaller than 32"
728#endif
729
Thomas Wouters477c8d52006-05-27 19:21:47 +0000730#define BLOOM_MASK unsigned long
731
Serhiy Storchaka05997252013-01-26 12:14:02 +0200732static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000733
Antoine Pitrouf068f942010-01-13 14:19:12 +0000734#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000735
Benjamin Peterson29060642009-01-31 22:14:21 +0000736#define BLOOM_LINEBREAK(ch) \
737 ((ch) < 128U ? ascii_linebreak[(ch)] : \
738 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000739
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700740static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000742{
Victor Stinnera85af502013-04-09 21:53:54 +0200743#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
744 do { \
745 TYPE *data = (TYPE *)PTR; \
746 TYPE *end = data + LEN; \
747 Py_UCS4 ch; \
748 for (; data != end; data++) { \
749 ch = *data; \
750 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
751 } \
752 break; \
753 } while (0)
754
Thomas Wouters477c8d52006-05-27 19:21:47 +0000755 /* calculate simple bloom-style bitmask for a given unicode string */
756
Antoine Pitrouf068f942010-01-13 14:19:12 +0000757 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000758
759 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200760 switch (kind) {
761 case PyUnicode_1BYTE_KIND:
762 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
763 break;
764 case PyUnicode_2BYTE_KIND:
765 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
766 break;
767 case PyUnicode_4BYTE_KIND:
768 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
769 break;
770 default:
771 assert(0);
772 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000773 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200774
775#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000776}
777
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300778static int
779ensure_unicode(PyObject *obj)
780{
781 if (!PyUnicode_Check(obj)) {
782 PyErr_Format(PyExc_TypeError,
783 "must be str, not %.100s",
784 Py_TYPE(obj)->tp_name);
785 return -1;
786 }
787 return PyUnicode_READY(obj);
788}
789
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200790/* Compilation of templated routines */
791
792#include "stringlib/asciilib.h"
793#include "stringlib/fastsearch.h"
794#include "stringlib/partition.h"
795#include "stringlib/split.h"
796#include "stringlib/count.h"
797#include "stringlib/find.h"
798#include "stringlib/find_max_char.h"
799#include "stringlib/localeutil.h"
800#include "stringlib/undef.h"
801
802#include "stringlib/ucs1lib.h"
803#include "stringlib/fastsearch.h"
804#include "stringlib/partition.h"
805#include "stringlib/split.h"
806#include "stringlib/count.h"
807#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300808#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200809#include "stringlib/find_max_char.h"
810#include "stringlib/localeutil.h"
811#include "stringlib/undef.h"
812
813#include "stringlib/ucs2lib.h"
814#include "stringlib/fastsearch.h"
815#include "stringlib/partition.h"
816#include "stringlib/split.h"
817#include "stringlib/count.h"
818#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300819#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200820#include "stringlib/find_max_char.h"
821#include "stringlib/localeutil.h"
822#include "stringlib/undef.h"
823
824#include "stringlib/ucs4lib.h"
825#include "stringlib/fastsearch.h"
826#include "stringlib/partition.h"
827#include "stringlib/split.h"
828#include "stringlib/count.h"
829#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300830#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200831#include "stringlib/find_max_char.h"
832#include "stringlib/localeutil.h"
833#include "stringlib/undef.h"
834
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200835#include "stringlib/unicodedefs.h"
836#include "stringlib/fastsearch.h"
837#include "stringlib/count.h"
838#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100839#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200840
Guido van Rossumd57fd912000-03-10 22:53:23 +0000841/* --- Unicode Object ----------------------------------------------------- */
842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200843static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200844fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700846static inline Py_ssize_t
847findchar(const void *s, int kind,
848 Py_ssize_t size, Py_UCS4 ch,
849 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200851 switch (kind) {
852 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200853 if ((Py_UCS1) ch != ch)
854 return -1;
855 if (direction > 0)
856 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
857 else
858 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200859 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200860 if ((Py_UCS2) ch != ch)
861 return -1;
862 if (direction > 0)
863 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
864 else
865 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200866 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200867 if (direction > 0)
868 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
869 else
870 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200871 default:
872 assert(0);
873 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875}
876
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in the range [old_length, current length) are
   overwritten, each byte being set to 0xff; nothing is done when the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
895
Victor Stinnerfe226c02011-10-03 03:52:20 +0200896static PyObject*
897resize_compact(PyObject *unicode, Py_ssize_t length)
898{
899 Py_ssize_t char_size;
900 Py_ssize_t struct_size;
901 Py_ssize_t new_size;
902 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100903 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200904#ifdef Py_DEBUG
905 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
906#endif
907
Victor Stinner79891572012-05-03 13:43:07 +0200908 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200909 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100910 assert(PyUnicode_IS_COMPACT(unicode));
911
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200912 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100913 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200914 struct_size = sizeof(PyASCIIObject);
915 else
916 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200917 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
920 PyErr_NoMemory();
921 return NULL;
922 }
923 new_size = (struct_size + (length + 1) * char_size);
924
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200925 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
926 PyObject_DEL(_PyUnicode_UTF8(unicode));
927 _PyUnicode_UTF8(unicode) = NULL;
928 _PyUnicode_UTF8_LENGTH(unicode) = 0;
929 }
Victor Stinner84def372011-12-11 20:04:56 +0100930 _Py_DEC_REFTOTAL;
931 _Py_ForgetReference(unicode);
932
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300933 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100934 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100935 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200936 PyErr_NoMemory();
937 return NULL;
938 }
Victor Stinner84def372011-12-11 20:04:56 +0100939 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100941
Victor Stinnerfe226c02011-10-03 03:52:20 +0200942 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200943 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200944 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100945 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200946 _PyUnicode_WSTR_LENGTH(unicode) = length;
947 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100948 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
949 PyObject_DEL(_PyUnicode_WSTR(unicode));
950 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100951 if (!PyUnicode_IS_ASCII(unicode))
952 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100953 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200954#ifdef Py_DEBUG
955 unicode_fill_invalid(unicode, old_length);
956#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
958 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200959 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200960 return unicode;
961}
962
Alexander Belopolsky40018472011-02-26 01:02:56 +0000963static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200964resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965{
Victor Stinner95663112011-10-04 01:03:50 +0200966 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100967 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200969 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000970
Victor Stinnerfe226c02011-10-03 03:52:20 +0200971 if (PyUnicode_IS_READY(unicode)) {
972 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200973 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200974 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200975#ifdef Py_DEBUG
976 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
977#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978
979 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200980 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200981 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
982 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200983
984 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
985 PyErr_NoMemory();
986 return -1;
987 }
988 new_size = (length + 1) * char_size;
989
Victor Stinner7a9105a2011-12-12 00:13:42 +0100990 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
991 {
992 PyObject_DEL(_PyUnicode_UTF8(unicode));
993 _PyUnicode_UTF8(unicode) = NULL;
994 _PyUnicode_UTF8_LENGTH(unicode) = 0;
995 }
996
Victor Stinnerfe226c02011-10-03 03:52:20 +0200997 data = (PyObject *)PyObject_REALLOC(data, new_size);
998 if (data == NULL) {
999 PyErr_NoMemory();
1000 return -1;
1001 }
1002 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001003 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001005 _PyUnicode_WSTR_LENGTH(unicode) = length;
1006 }
1007 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001008 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001009 _PyUnicode_UTF8_LENGTH(unicode) = length;
1010 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001011 _PyUnicode_LENGTH(unicode) = length;
1012 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001013#ifdef Py_DEBUG
1014 unicode_fill_invalid(unicode, old_length);
1015#endif
Victor Stinner95663112011-10-04 01:03:50 +02001016 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001017 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001020 }
Victor Stinner95663112011-10-04 01:03:50 +02001021 assert(_PyUnicode_WSTR(unicode) != NULL);
1022
1023 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001024 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001025 PyErr_NoMemory();
1026 return -1;
1027 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001028 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001029 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001030 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001031 if (!wstr) {
1032 PyErr_NoMemory();
1033 return -1;
1034 }
1035 _PyUnicode_WSTR(unicode) = wstr;
1036 _PyUnicode_WSTR(unicode)[length] = 0;
1037 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001038 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 return 0;
1040}
1041
Victor Stinnerfe226c02011-10-03 03:52:20 +02001042static PyObject*
1043resize_copy(PyObject *unicode, Py_ssize_t length)
1044{
1045 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001046 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001048
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001049 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001050
1051 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1052 if (copy == NULL)
1053 return NULL;
1054
1055 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001056 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001057 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001058 }
1059 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001060 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001061
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001062 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 if (w == NULL)
1064 return NULL;
1065 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1066 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001067 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001068 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001069 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 }
1071}
1072
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1081
Alexander Belopolsky40018472011-02-26 01:02:56 +00001082static PyUnicodeObject *
1083_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001085 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087
Thomas Wouters477c8d52006-05-27 19:21:47 +00001088 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089 if (length == 0 && unicode_empty != NULL) {
1090 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001091 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 }
1093
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001094 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001095 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001096 return (PyUnicodeObject *)PyErr_NoMemory();
1097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 if (length < 0) {
1099 PyErr_SetString(PyExc_SystemError,
1100 "Negative size passed to _PyUnicode_New");
1101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 }
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1105 if (unicode == NULL)
1106 return NULL;
1107 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001108
1109 _PyUnicode_WSTR_LENGTH(unicode) = length;
1110 _PyUnicode_HASH(unicode) = -1;
1111 _PyUnicode_STATE(unicode).interned = 0;
1112 _PyUnicode_STATE(unicode).kind = 0;
1113 _PyUnicode_STATE(unicode).compact = 0;
1114 _PyUnicode_STATE(unicode).ready = 0;
1115 _PyUnicode_STATE(unicode).ascii = 0;
1116 _PyUnicode_DATA_ANY(unicode) = NULL;
1117 _PyUnicode_LENGTH(unicode) = 0;
1118 _PyUnicode_UTF8(unicode) = NULL;
1119 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1122 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001123 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001125 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127
Jeremy Hyltond8082792003-09-16 19:41:39 +00001128 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001129 * the caller fails before initializing str -- unicode_resize()
1130 * reads str[0], and the Keep-Alive optimization can keep memory
1131 * allocated for str alive across a call to unicode_dealloc(unicode).
1132 * We don't want unicode_resize to read uninitialized memory in
1133 * that case.
1134 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135 _PyUnicode_WSTR(unicode)[0] = 0;
1136 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001137
Victor Stinner7931d9a2011-11-04 00:22:48 +01001138 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139 return unicode;
1140}
1141
Victor Stinnerf42dc442011-10-02 23:33:16 +02001142static const char*
1143unicode_kind_name(PyObject *unicode)
1144{
Victor Stinner42dfd712011-10-03 14:41:45 +02001145 /* don't check consistency: unicode_kind_name() is called from
1146 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001147 if (!PyUnicode_IS_COMPACT(unicode))
1148 {
1149 if (!PyUnicode_IS_READY(unicode))
1150 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 {
1153 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001154 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001155 return "legacy ascii";
1156 else
1157 return "legacy latin1";
1158 case PyUnicode_2BYTE_KIND:
1159 return "legacy UCS2";
1160 case PyUnicode_4BYTE_KIND:
1161 return "legacy UCS4";
1162 default:
1163 return "<legacy invalid kind>";
1164 }
1165 }
1166 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001167 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001168 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001169 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001170 return "ascii";
1171 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001172 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001173 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001174 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001175 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001176 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001177 default:
1178 return "<invalid compact kind>";
1179 }
1180}
1181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001183/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1187
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1191void *_PyUnicode_data(void *unicode){
1192 printf("obj %p\n", unicode);
1193 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1194 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1195 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1196 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1197 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1198 return PyUnicode_DATA(unicode);
1199}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001200
1201void
1202_PyUnicode_Dump(PyObject *op)
1203{
1204 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1206 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1207 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera849a4b2011-10-03 12:12:11 +02001209 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001210 {
1211 if (ascii->state.ascii)
1212 data = (ascii + 1);
1213 else
1214 data = (compact + 1);
1215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 else
1217 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001218 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1219 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001220
Victor Stinnera849a4b2011-10-03 12:12:11 +02001221 if (ascii->wstr == data)
1222 printf("shared ");
1223 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001224
Victor Stinnera3b334d2011-10-03 13:53:37 +02001225 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001226 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001227 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1228 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001229 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1230 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001231 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001232 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001233}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234#endif
1235
1236PyObject *
1237PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1238{
1239 PyObject *obj;
1240 PyCompactUnicodeObject *unicode;
1241 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001242 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001243 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 Py_ssize_t char_size;
1245 Py_ssize_t struct_size;
1246
1247 /* Optimization for empty strings */
1248 if (size == 0 && unicode_empty != NULL) {
1249 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001250 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
1252
Victor Stinner9e9d6892011-10-04 01:02:02 +02001253 is_ascii = 0;
1254 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 struct_size = sizeof(PyCompactUnicodeObject);
1256 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001257 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 char_size = 1;
1259 is_ascii = 1;
1260 struct_size = sizeof(PyASCIIObject);
1261 }
1262 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001263 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264 char_size = 1;
1265 }
1266 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001267 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 char_size = 2;
1269 if (sizeof(wchar_t) == 2)
1270 is_sharing = 1;
1271 }
1272 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001273 if (maxchar > MAX_UNICODE) {
1274 PyErr_SetString(PyExc_SystemError,
1275 "invalid maximum character passed to PyUnicode_New");
1276 return NULL;
1277 }
Victor Stinner8f825062012-04-27 13:55:39 +02001278 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 char_size = 4;
1280 if (sizeof(wchar_t) == 4)
1281 is_sharing = 1;
1282 }
1283
1284 /* Ensure we won't overflow the size. */
1285 if (size < 0) {
1286 PyErr_SetString(PyExc_SystemError,
1287 "Negative size passed to PyUnicode_New");
1288 return NULL;
1289 }
1290 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1291 return PyErr_NoMemory();
1292
1293 /* Duplicated allocation code from _PyObject_New() instead of a call to
1294 * PyObject_New() so we are able to allocate space for the object and
1295 * it's data buffer.
1296 */
1297 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1298 if (obj == NULL)
1299 return PyErr_NoMemory();
1300 obj = PyObject_INIT(obj, &PyUnicode_Type);
1301 if (obj == NULL)
1302 return NULL;
1303
1304 unicode = (PyCompactUnicodeObject *)obj;
1305 if (is_ascii)
1306 data = ((PyASCIIObject*)obj) + 1;
1307 else
1308 data = unicode + 1;
1309 _PyUnicode_LENGTH(unicode) = size;
1310 _PyUnicode_HASH(unicode) = -1;
1311 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001312 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 _PyUnicode_STATE(unicode).compact = 1;
1314 _PyUnicode_STATE(unicode).ready = 1;
1315 _PyUnicode_STATE(unicode).ascii = is_ascii;
1316 if (is_ascii) {
1317 ((char*)data)[size] = 0;
1318 _PyUnicode_WSTR(unicode) = NULL;
1319 }
Victor Stinner8f825062012-04-27 13:55:39 +02001320 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ((char*)data)[size] = 0;
1322 _PyUnicode_WSTR(unicode) = NULL;
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001325 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 else {
1328 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001329 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001330 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001332 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 ((Py_UCS4*)data)[size] = 0;
1334 if (is_sharing) {
1335 _PyUnicode_WSTR_LENGTH(unicode) = size;
1336 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1337 }
1338 else {
1339 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1340 _PyUnicode_WSTR(unicode) = NULL;
1341 }
1342 }
Victor Stinner8f825062012-04-27 13:55:39 +02001343#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001344 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001345#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001346 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347 return obj;
1348}
1349
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   Reads the wchar_t units in [begin, end) and stores the decoded code points
   into the 4-byte data of unicode.  A high surrogate immediately followed by
   a low surrogate becomes one code point; every other unit is copied as is.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* valid pair: two units make one code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1389
Victor Stinnercd9950f2011-10-02 00:34:53 +02001390static int
Victor Stinner488fa492011-12-12 00:01:39 +01001391unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001392{
Victor Stinner488fa492011-12-12 00:01:39 +01001393 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001394 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001395 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001396 return -1;
1397 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001398 return 0;
1399}
1400
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001401static int
1402_copy_characters(PyObject *to, Py_ssize_t to_start,
1403 PyObject *from, Py_ssize_t from_start,
1404 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001406 unsigned int from_kind, to_kind;
1407 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinneree4544c2012-05-09 22:24:08 +02001409 assert(0 <= how_many);
1410 assert(0 <= from_start);
1411 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001413 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001414 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
Victor Stinnerd3f08822012-05-29 12:57:52 +02001416 assert(PyUnicode_Check(to));
1417 assert(PyUnicode_IS_READY(to));
1418 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1419
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001420 if (how_many == 0)
1421 return 0;
1422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001424 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001426 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427
Victor Stinnerf1852262012-06-16 16:38:26 +02001428#ifdef Py_DEBUG
1429 if (!check_maxchar
1430 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1431 {
1432 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1433 Py_UCS4 ch;
1434 Py_ssize_t i;
1435 for (i=0; i < how_many; i++) {
1436 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1437 assert(ch <= to_maxchar);
1438 }
1439 }
1440#endif
1441
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001442 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001443 if (check_maxchar
1444 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1445 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001446 /* Writing Latin-1 characters into an ASCII string requires to
1447 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001448 Py_UCS4 max_char;
1449 max_char = ucs1lib_find_max_char(from_data,
1450 (Py_UCS1*)from_data + how_many);
1451 if (max_char >= 128)
1452 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001453 }
Christian Heimesf051e432016-09-13 20:22:02 +02001454 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001455 (char*)from_data + from_kind * from_start,
1456 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001458 else if (from_kind == PyUnicode_1BYTE_KIND
1459 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 {
1461 _PyUnicode_CONVERT_BYTES(
1462 Py_UCS1, Py_UCS2,
1463 PyUnicode_1BYTE_DATA(from) + from_start,
1464 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1465 PyUnicode_2BYTE_DATA(to) + to_start
1466 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001467 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001468 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001469 && to_kind == PyUnicode_4BYTE_KIND)
1470 {
1471 _PyUnicode_CONVERT_BYTES(
1472 Py_UCS1, Py_UCS4,
1473 PyUnicode_1BYTE_DATA(from) + from_start,
1474 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1475 PyUnicode_4BYTE_DATA(to) + to_start
1476 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001477 }
1478 else if (from_kind == PyUnicode_2BYTE_KIND
1479 && to_kind == PyUnicode_4BYTE_KIND)
1480 {
1481 _PyUnicode_CONVERT_BYTES(
1482 Py_UCS2, Py_UCS4,
1483 PyUnicode_2BYTE_DATA(from) + from_start,
1484 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1485 PyUnicode_4BYTE_DATA(to) + to_start
1486 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001487 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001489 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1490
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001491 if (!check_maxchar) {
1492 if (from_kind == PyUnicode_2BYTE_KIND
1493 && to_kind == PyUnicode_1BYTE_KIND)
1494 {
1495 _PyUnicode_CONVERT_BYTES(
1496 Py_UCS2, Py_UCS1,
1497 PyUnicode_2BYTE_DATA(from) + from_start,
1498 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1499 PyUnicode_1BYTE_DATA(to) + to_start
1500 );
1501 }
1502 else if (from_kind == PyUnicode_4BYTE_KIND
1503 && to_kind == PyUnicode_1BYTE_KIND)
1504 {
1505 _PyUnicode_CONVERT_BYTES(
1506 Py_UCS4, Py_UCS1,
1507 PyUnicode_4BYTE_DATA(from) + from_start,
1508 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1509 PyUnicode_1BYTE_DATA(to) + to_start
1510 );
1511 }
1512 else if (from_kind == PyUnicode_4BYTE_KIND
1513 && to_kind == PyUnicode_2BYTE_KIND)
1514 {
1515 _PyUnicode_CONVERT_BYTES(
1516 Py_UCS4, Py_UCS2,
1517 PyUnicode_4BYTE_DATA(from) + from_start,
1518 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1519 PyUnicode_2BYTE_DATA(to) + to_start
1520 );
1521 }
1522 else {
1523 assert(0);
1524 return -1;
1525 }
1526 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001527 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001528 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 Py_ssize_t i;
1531
Victor Stinnera0702ab2011-09-29 14:14:38 +02001532 for (i=0; i < how_many; i++) {
1533 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001534 if (ch > to_maxchar)
1535 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001536 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1537 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001538 }
1539 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001540 return 0;
1541}
1542
Victor Stinnerd3f08822012-05-29 12:57:52 +02001543void
1544_PyUnicode_FastCopyCharacters(
1545 PyObject *to, Py_ssize_t to_start,
1546 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547{
1548 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1549}
1550
1551Py_ssize_t
1552PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1553 PyObject *from, Py_ssize_t from_start,
1554 Py_ssize_t how_many)
1555{
1556 int err;
1557
1558 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1559 PyErr_BadInternalCall();
1560 return -1;
1561 }
1562
Benjamin Petersonbac79492012-01-14 13:34:47 -05001563 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001565 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001566 return -1;
1567
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001568 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001569 PyErr_SetString(PyExc_IndexError, "string index out of range");
1570 return -1;
1571 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001572 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001573 PyErr_SetString(PyExc_IndexError, "string index out of range");
1574 return -1;
1575 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001576 if (how_many < 0) {
1577 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1578 return -1;
1579 }
1580 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001581 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1582 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001583 "Cannot write %zi characters at %zi "
1584 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001585 how_many, to_start, PyUnicode_GET_LENGTH(to));
1586 return -1;
1587 }
1588
1589 if (how_many == 0)
1590 return 0;
1591
Victor Stinner488fa492011-12-12 00:01:39 +01001592 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001593 return -1;
1594
1595 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1596 if (err) {
1597 PyErr_Format(PyExc_SystemError,
1598 "Cannot copy %s characters "
1599 "into a string of %s characters",
1600 unicode_kind_name(from),
1601 unicode_kind_name(to));
1602 return -1;
1603 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001604 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001605}
1606
Victor Stinner17222162011-09-28 22:15:37 +02001607/* Find the maximum code point and count the number of surrogate pairs so a
1608 correct string length can be computed before converting a string to UCS4.
1609 This function counts single surrogates as a character and not as a pair.
1610
1611 Return 0 on success, or -1 on error. */
1612static int
1613find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1614 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615{
1616 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001617 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618
Victor Stinnerc53be962011-10-02 21:33:54 +02001619 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620 *num_surrogates = 0;
1621 *maxchar = 0;
1622
1623 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001624#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001625 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1626 && (iter+1) < end
1627 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1628 {
1629 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1630 ++(*num_surrogates);
1631 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 }
1633 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001635 {
1636 ch = *iter;
1637 iter++;
1638 }
1639 if (ch > *maxchar) {
1640 *maxchar = ch;
1641 if (*maxchar > MAX_UNICODE) {
1642 PyErr_Format(PyExc_ValueError,
1643 "character U+%x is not in range [U+0000; U+10ffff]",
1644 ch);
1645 return -1;
1646 }
1647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 }
1649 return 0;
1650}
1651
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001652int
1653_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654{
1655 wchar_t *end;
1656 Py_UCS4 maxchar = 0;
1657 Py_ssize_t num_surrogates;
1658#if SIZEOF_WCHAR_T == 2
1659 Py_ssize_t length_wo_surrogates;
1660#endif
1661
Georg Brandl7597add2011-10-05 16:36:47 +02001662 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001663 strings were created using _PyObject_New() and where no canonical
1664 representation (the str field) has been set yet aka strings
1665 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001666 assert(_PyUnicode_CHECK(unicode));
1667 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001669 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001670 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001671 /* Actually, it should neither be interned nor be anything else: */
1672 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001675 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001676 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678
1679 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001680 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1681 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 PyErr_NoMemory();
1683 return -1;
1684 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001685 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 _PyUnicode_WSTR(unicode), end,
1687 PyUnicode_1BYTE_DATA(unicode));
1688 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1689 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1690 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1691 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001692 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001694 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695 }
1696 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001697 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001698 _PyUnicode_UTF8(unicode) = NULL;
1699 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 }
1701 PyObject_FREE(_PyUnicode_WSTR(unicode));
1702 _PyUnicode_WSTR(unicode) = NULL;
1703 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1704 }
1705 /* In this case we might have to convert down from 4-byte native
1706 wchar_t to 2-byte unicode. */
1707 else if (maxchar < 65536) {
1708 assert(num_surrogates == 0 &&
1709 "FindMaxCharAndNumSurrogatePairs() messed up");
1710
Victor Stinner506f5922011-09-28 22:34:18 +02001711#if SIZEOF_WCHAR_T == 2
1712 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001713 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001714 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8(unicode) = NULL;
1718 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001719#else
1720 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001721 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001722 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001723 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001724 PyErr_NoMemory();
1725 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 }
Victor Stinner506f5922011-09-28 22:34:18 +02001727 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1728 _PyUnicode_WSTR(unicode), end,
1729 PyUnicode_2BYTE_DATA(unicode));
1730 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1731 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1732 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001733 _PyUnicode_UTF8(unicode) = NULL;
1734 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001735 PyObject_FREE(_PyUnicode_WSTR(unicode));
1736 _PyUnicode_WSTR(unicode) = NULL;
1737 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1738#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 }
1740 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1741 else {
1742#if SIZEOF_WCHAR_T == 2
1743 /* in case the native representation is 2-bytes, we need to allocate a
1744 new normalized 4-byte version. */
1745 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001746 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1747 PyErr_NoMemory();
1748 return -1;
1749 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001750 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1751 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 PyErr_NoMemory();
1753 return -1;
1754 }
1755 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1756 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001757 _PyUnicode_UTF8(unicode) = NULL;
1758 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001759 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1760 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001761 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 PyObject_FREE(_PyUnicode_WSTR(unicode));
1763 _PyUnicode_WSTR(unicode) = NULL;
1764 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1765#else
1766 assert(num_surrogates == 0);
1767
Victor Stinnerc3c74152011-10-02 20:39:55 +02001768 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001770 _PyUnicode_UTF8(unicode) = NULL;
1771 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1773#endif
1774 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1775 }
1776 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001777 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 return 0;
1779}
1780
Alexander Belopolsky40018472011-02-26 01:02:56 +00001781static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001782unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783{
Walter Dörwald16807132007-05-25 13:52:07 +00001784 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 case SSTATE_NOT_INTERNED:
1786 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001787
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 case SSTATE_INTERNED_MORTAL:
1789 /* revive dead object temporarily for DelItem */
1790 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001791 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001792 Py_FatalError(
1793 "deletion of interned string failed");
1794 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001795
Benjamin Peterson29060642009-01-31 22:14:21 +00001796 case SSTATE_INTERNED_IMMORTAL:
1797 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001798 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001799
Benjamin Peterson29060642009-01-31 22:14:21 +00001800 default:
1801 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001802 }
1803
Victor Stinner03490912011-10-03 23:45:12 +02001804 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001806 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001807 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001808 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1809 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001811 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812}
1813
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only one-character strings that are not of the wchar kind can be
       entries of the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1830
Alexander Belopolsky40018472011-02-26 01:02:56 +00001831static int
Victor Stinner488fa492011-12-12 00:01:39 +01001832unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001833{
Victor Stinner488fa492011-12-12 00:01:39 +01001834 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001835 if (Py_REFCNT(unicode) != 1)
1836 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001837 if (_PyUnicode_HASH(unicode) != -1)
1838 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001839 if (PyUnicode_CHECK_INTERNED(unicode))
1840 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001841 if (!PyUnicode_CheckExact(unicode))
1842 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001843#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001844 /* singleton refcount is greater than 1 */
1845 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001846#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001847 return 1;
1848}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001849
Victor Stinnerfe226c02011-10-03 03:52:20 +02001850static int
1851unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1852{
1853 PyObject *unicode;
1854 Py_ssize_t old_length;
1855
1856 assert(p_unicode != NULL);
1857 unicode = *p_unicode;
1858
1859 assert(unicode != NULL);
1860 assert(PyUnicode_Check(unicode));
1861 assert(0 <= length);
1862
Victor Stinner910337b2011-10-03 03:20:16 +02001863 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001864 old_length = PyUnicode_WSTR_LENGTH(unicode);
1865 else
1866 old_length = PyUnicode_GET_LENGTH(unicode);
1867 if (old_length == length)
1868 return 0;
1869
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001870 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001871 _Py_INCREF_UNICODE_EMPTY();
1872 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001873 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001874 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001875 return 0;
1876 }
1877
Victor Stinner488fa492011-12-12 00:01:39 +01001878 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001879 PyObject *copy = resize_copy(unicode, length);
1880 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001882 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001883 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001884 }
1885
Victor Stinnerfe226c02011-10-03 03:52:20 +02001886 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001887 PyObject *new_unicode = resize_compact(unicode, length);
1888 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001890 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001891 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001892 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001893 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001894}
1895
Alexander Belopolsky40018472011-02-26 01:02:56 +00001896int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001898{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001899 PyObject *unicode;
1900 if (p_unicode == NULL) {
1901 PyErr_BadInternalCall();
1902 return -1;
1903 }
1904 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001905 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001906 {
1907 PyErr_BadInternalCall();
1908 return -1;
1909 }
1910 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001911}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001912
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001913/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001914
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001915 WARNING: The function doesn't copy the terminating null character and
1916 doesn't check the maximum character (may write a latin1 character in an
1917 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001918static void
1919unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1920 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001921{
1922 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1923 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001924 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001925
1926 switch (kind) {
1927 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001929#ifdef Py_DEBUG
1930 if (PyUnicode_IS_ASCII(unicode)) {
1931 Py_UCS4 maxchar = ucs1lib_find_max_char(
1932 (const Py_UCS1*)str,
1933 (const Py_UCS1*)str + len);
1934 assert(maxchar < 128);
1935 }
1936#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001937 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001938 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001939 }
1940 case PyUnicode_2BYTE_KIND: {
1941 Py_UCS2 *start = (Py_UCS2 *)data + index;
1942 Py_UCS2 *ucs2 = start;
1943 assert(index <= PyUnicode_GET_LENGTH(unicode));
1944
Victor Stinner184252a2012-06-16 02:57:41 +02001945 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001946 *ucs2 = (Py_UCS2)*str;
1947
1948 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001949 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001950 }
1951 default: {
1952 Py_UCS4 *start = (Py_UCS4 *)data + index;
1953 Py_UCS4 *ucs4 = start;
1954 assert(kind == PyUnicode_4BYTE_KIND);
1955 assert(index <= PyUnicode_GET_LENGTH(unicode));
1956
Victor Stinner184252a2012-06-16 02:57:41 +02001957 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001958 *ucs4 = (Py_UCS4)*str;
1959
1960 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001961 }
1962 }
1963}
1964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965static PyObject*
1966get_latin1_char(unsigned char ch)
1967{
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001970 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001971 if (!unicode)
1972 return NULL;
1973 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001974 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 unicode_latin1[ch] = unicode;
1976 }
1977 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001978 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979}
1980
Victor Stinner985a82a2014-01-03 12:53:47 +01001981static PyObject*
1982unicode_char(Py_UCS4 ch)
1983{
1984 PyObject *unicode;
1985
1986 assert(ch <= MAX_UNICODE);
1987
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001988 if (ch < 256)
1989 return get_latin1_char(ch);
1990
Victor Stinner985a82a2014-01-03 12:53:47 +01001991 unicode = PyUnicode_New(1, ch);
1992 if (unicode == NULL)
1993 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001994
1995 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1996 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001998 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001999 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2000 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2001 }
2002 assert(_PyUnicode_CheckConsistency(unicode, 1));
2003 return unicode;
2004}
2005
Alexander Belopolsky40018472011-02-26 01:02:56 +00002006PyObject *
2007PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002009 if (u == NULL)
2010 return (PyObject*)_PyUnicode_New(size);
2011
2012 if (size < 0) {
2013 PyErr_BadInternalCall();
2014 return NULL;
2015 }
2016
2017 return PyUnicode_FromWideChar(u, size);
2018}
2019
2020PyObject *
2021PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2022{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002023 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 Py_UCS4 maxchar = 0;
2025 Py_ssize_t num_surrogates;
2026
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002027 if (u == NULL && size != 0) {
2028 PyErr_BadInternalCall();
2029 return NULL;
2030 }
2031
2032 if (size == -1) {
2033 size = wcslen(u);
2034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036 /* If the Unicode data is known at construction time, we can apply
2037 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002040 if (size == 0)
2041 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 /* Single character Unicode objects in the Latin-1 range are
2044 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002045 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 return get_latin1_char((unsigned char)*u);
2047
2048 /* If not empty and not single character, copy the Unicode data
2049 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002050 if (find_maxchar_surrogates(u, u + size,
2051 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 return NULL;
2053
Victor Stinner8faf8212011-12-08 22:14:11 +01002054 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 if (!unicode)
2056 return NULL;
2057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 switch (PyUnicode_KIND(unicode)) {
2059 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002060 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002061 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2062 break;
2063 case PyUnicode_2BYTE_KIND:
2064#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002065 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002067 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2069#endif
2070 break;
2071 case PyUnicode_4BYTE_KIND:
2072#if SIZEOF_WCHAR_T == 2
2073 /* This is the only case which has to process surrogates, thus
2074 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002075 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076#else
2077 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002078 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002079#endif
2080 break;
2081 default:
2082 assert(0 && "Impossible state");
2083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002085 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086}
2087
Alexander Belopolsky40018472011-02-26 01:02:56 +00002088PyObject *
2089PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002090{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002091 if (size < 0) {
2092 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002093 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002094 return NULL;
2095 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002096 if (u != NULL)
2097 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2098 else
2099 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002100}
2101
Alexander Belopolsky40018472011-02-26 01:02:56 +00002102PyObject *
2103PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002104{
2105 size_t size = strlen(u);
2106 if (size > PY_SSIZE_T_MAX) {
2107 PyErr_SetString(PyExc_OverflowError, "input too long");
2108 return NULL;
2109 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002110 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002111}
2112
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002113PyObject *
2114_PyUnicode_FromId(_Py_Identifier *id)
2115{
2116 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002117 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2118 strlen(id->string),
2119 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002120 if (!id->object)
2121 return NULL;
2122 PyUnicode_InternInPlace(&id->object);
2123 assert(!id->next);
2124 id->next = static_strings;
2125 static_strings = id;
2126 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002127 return id->object;
2128}
2129
2130void
2131_PyUnicode_ClearStaticStrings()
2132{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002133 _Py_Identifier *tmp, *s = static_strings;
2134 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002135 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002136 tmp = s->next;
2137 s->next = NULL;
2138 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002140 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002141}
2142
Benjamin Peterson0df54292012-03-26 14:50:32 -04002143/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002144
Victor Stinnerd3f08822012-05-29 12:57:52 +02002145PyObject*
2146_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002147{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002148 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002149 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002150 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002152 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002153#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002154 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002155 }
Victor Stinner785938e2011-12-11 20:09:03 +01002156 unicode = PyUnicode_New(size, 127);
2157 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002158 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002159 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2160 assert(_PyUnicode_CheckConsistency(unicode, 1));
2161 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002162}
2163
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002164static Py_UCS4
2165kind_maxchar_limit(unsigned int kind)
2166{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002167 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002168 case PyUnicode_1BYTE_KIND:
2169 return 0x80;
2170 case PyUnicode_2BYTE_KIND:
2171 return 0x100;
2172 case PyUnicode_4BYTE_KIND:
2173 return 0x10000;
2174 default:
2175 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002176 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002177 }
2178}
2179
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002180static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002181align_maxchar(Py_UCS4 maxchar)
2182{
2183 if (maxchar <= 127)
2184 return 127;
2185 else if (maxchar <= 255)
2186 return 255;
2187 else if (maxchar <= 65535)
2188 return 65535;
2189 else
2190 return MAX_UNICODE;
2191}
2192
Victor Stinner702c7342011-10-05 13:50:52 +02002193static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002194_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002197 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002198
Serhiy Storchaka678db842013-01-26 12:16:36 +02002199 if (size == 0)
2200 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002201 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002202 if (size == 1)
2203 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002204
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002205 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002206 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 if (!res)
2208 return NULL;
2209 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002210 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002212}
2213
Victor Stinnere57b1c02011-09-28 22:20:48 +02002214static PyObject*
2215_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216{
2217 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002218 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002219
Serhiy Storchaka678db842013-01-26 12:16:36 +02002220 if (size == 0)
2221 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002222 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002223 if (size == 1)
2224 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002225
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002226 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002227 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 if (!res)
2229 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002230 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002232 else {
2233 _PyUnicode_CONVERT_BYTES(
2234 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2235 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002236 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 return res;
2238}
2239
Victor Stinnere57b1c02011-09-28 22:20:48 +02002240static PyObject*
2241_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242{
2243 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002244 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245
Serhiy Storchaka678db842013-01-26 12:16:36 +02002246 if (size == 0)
2247 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002248 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002249 if (size == 1)
2250 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002251
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002252 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002253 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 if (!res)
2255 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 if (max_char < 256)
2257 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2258 PyUnicode_1BYTE_DATA(res));
2259 else if (max_char < 0x10000)
2260 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2261 PyUnicode_2BYTE_DATA(res));
2262 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002264 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 return res;
2266}
2267
2268PyObject*
2269PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2270{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002271 if (size < 0) {
2272 PyErr_SetString(PyExc_ValueError, "size must be positive");
2273 return NULL;
2274 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002275 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002277 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002281 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002282 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002283 PyErr_SetString(PyExc_SystemError, "invalid kind");
2284 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286}
2287
Victor Stinnerece58de2012-04-23 23:36:38 +02002288Py_UCS4
2289_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2290{
2291 enum PyUnicode_Kind kind;
2292 void *startptr, *endptr;
2293
2294 assert(PyUnicode_IS_READY(unicode));
2295 assert(0 <= start);
2296 assert(end <= PyUnicode_GET_LENGTH(unicode));
2297 assert(start <= end);
2298
2299 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2300 return PyUnicode_MAX_CHAR_VALUE(unicode);
2301
2302 if (start == end)
2303 return 127;
2304
Victor Stinner94d558b2012-04-27 22:26:58 +02002305 if (PyUnicode_IS_ASCII(unicode))
2306 return 127;
2307
Victor Stinnerece58de2012-04-23 23:36:38 +02002308 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002309 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002310 endptr = (char *)startptr + end * kind;
2311 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002312 switch(kind) {
2313 case PyUnicode_1BYTE_KIND:
2314 return ucs1lib_find_max_char(startptr, endptr);
2315 case PyUnicode_2BYTE_KIND:
2316 return ucs2lib_find_max_char(startptr, endptr);
2317 case PyUnicode_4BYTE_KIND:
2318 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002319 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002320 assert(0);
2321 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002322 }
2323}
2324
Victor Stinner25a4b292011-10-06 12:31:55 +02002325/* Ensure that a string uses the most efficient storage, if it is not the
2326 case: create a new string with of the right kind. Write NULL into *p_unicode
2327 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002328static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002329unicode_adjust_maxchar(PyObject **p_unicode)
2330{
2331 PyObject *unicode, *copy;
2332 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002333 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002334 unsigned int kind;
2335
2336 assert(p_unicode != NULL);
2337 unicode = *p_unicode;
2338 assert(PyUnicode_IS_READY(unicode));
2339 if (PyUnicode_IS_ASCII(unicode))
2340 return;
2341
2342 len = PyUnicode_GET_LENGTH(unicode);
2343 kind = PyUnicode_KIND(unicode);
2344 if (kind == PyUnicode_1BYTE_KIND) {
2345 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002346 max_char = ucs1lib_find_max_char(u, u + len);
2347 if (max_char >= 128)
2348 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002349 }
2350 else if (kind == PyUnicode_2BYTE_KIND) {
2351 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002352 max_char = ucs2lib_find_max_char(u, u + len);
2353 if (max_char >= 256)
2354 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002355 }
2356 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002358 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002359 max_char = ucs4lib_find_max_char(u, u + len);
2360 if (max_char >= 0x10000)
2361 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002362 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002363 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002364 if (copy != NULL)
2365 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002366 Py_DECREF(unicode);
2367 *p_unicode = copy;
2368}
2369
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002371_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002372{
Victor Stinner87af4f22011-11-21 23:03:47 +01002373 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002374 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002375
Victor Stinner034f6cf2011-09-30 02:26:44 +02002376 if (!PyUnicode_Check(unicode)) {
2377 PyErr_BadInternalCall();
2378 return NULL;
2379 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002380 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002381 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002382
Victor Stinner87af4f22011-11-21 23:03:47 +01002383 length = PyUnicode_GET_LENGTH(unicode);
2384 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002385 if (!copy)
2386 return NULL;
2387 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2388
Christian Heimesf051e432016-09-13 20:22:02 +02002389 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002390 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002391 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002392 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002393}
2394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395
Victor Stinnerbc603d12011-10-02 01:00:40 +02002396/* Widen Unicode objects to larger buffers. Don't write terminating null
2397 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398
2399void*
2400_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2401{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002402 Py_ssize_t len;
2403 void *result;
2404 unsigned int skind;
2405
Benjamin Petersonbac79492012-01-14 13:34:47 -05002406 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002407 return NULL;
2408
2409 len = PyUnicode_GET_LENGTH(s);
2410 skind = PyUnicode_KIND(s);
2411 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002412 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return NULL;
2414 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002415 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002417 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002418 if (!result)
2419 return PyErr_NoMemory();
2420 assert(skind == PyUnicode_1BYTE_KIND);
2421 _PyUnicode_CONVERT_BYTES(
2422 Py_UCS1, Py_UCS2,
2423 PyUnicode_1BYTE_DATA(s),
2424 PyUnicode_1BYTE_DATA(s) + len,
2425 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002428 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002429 if (!result)
2430 return PyErr_NoMemory();
2431 if (skind == PyUnicode_2BYTE_KIND) {
2432 _PyUnicode_CONVERT_BYTES(
2433 Py_UCS2, Py_UCS4,
2434 PyUnicode_2BYTE_DATA(s),
2435 PyUnicode_2BYTE_DATA(s) + len,
2436 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002438 else {
2439 assert(skind == PyUnicode_1BYTE_KIND);
2440 _PyUnicode_CONVERT_BYTES(
2441 Py_UCS1, Py_UCS4,
2442 PyUnicode_1BYTE_DATA(s),
2443 PyUnicode_1BYTE_DATA(s) + len,
2444 result);
2445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002447 default:
2448 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 }
Victor Stinner01698042011-10-04 00:04:26 +02002450 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 return NULL;
2452}
2453
2454static Py_UCS4*
2455as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2456 int copy_null)
2457{
2458 int kind;
2459 void *data;
2460 Py_ssize_t len, targetlen;
2461 if (PyUnicode_READY(string) == -1)
2462 return NULL;
2463 kind = PyUnicode_KIND(string);
2464 data = PyUnicode_DATA(string);
2465 len = PyUnicode_GET_LENGTH(string);
2466 targetlen = len;
2467 if (copy_null)
2468 targetlen++;
2469 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002470 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 if (!target) {
2472 PyErr_NoMemory();
2473 return NULL;
2474 }
2475 }
2476 else {
2477 if (targetsize < targetlen) {
2478 PyErr_Format(PyExc_SystemError,
2479 "string is longer than the buffer");
2480 if (copy_null && 0 < targetsize)
2481 target[0] = 0;
2482 return NULL;
2483 }
2484 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002485 if (kind == PyUnicode_1BYTE_KIND) {
2486 Py_UCS1 *start = (Py_UCS1 *) data;
2487 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002489 else if (kind == PyUnicode_2BYTE_KIND) {
2490 Py_UCS2 *start = (Py_UCS2 *) data;
2491 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2492 }
2493 else {
2494 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002495 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 if (copy_null)
2498 target[len] = 0;
2499 return target;
2500}
2501
2502Py_UCS4*
2503PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2504 int copy_null)
2505{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002506 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 PyErr_BadInternalCall();
2508 return NULL;
2509 }
2510 return as_ucs4(string, target, targetsize, copy_null);
2511}
2512
2513Py_UCS4*
2514PyUnicode_AsUCS4Copy(PyObject *string)
2515{
2516 return as_ucs4(string, NULL, 0, 1);
2517}
2518
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2522#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002523
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002524static int
2525unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2526 Py_ssize_t width, Py_ssize_t precision)
2527{
2528 Py_ssize_t length, fill, arglen;
2529 Py_UCS4 maxchar;
2530
2531 if (PyUnicode_READY(str) == -1)
2532 return -1;
2533
2534 length = PyUnicode_GET_LENGTH(str);
2535 if ((precision == -1 || precision >= length)
2536 && width <= length)
2537 return _PyUnicodeWriter_WriteStr(writer, str);
2538
2539 if (precision != -1)
2540 length = Py_MIN(precision, length);
2541
2542 arglen = Py_MAX(length, width);
2543 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2544 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2545 else
2546 maxchar = writer->maxchar;
2547
2548 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2549 return -1;
2550
2551 if (width > length) {
2552 fill = width - length;
2553 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2554 return -1;
2555 writer->pos += fill;
2556 }
2557
2558 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2559 str, 0, length);
2560 writer->pos += length;
2561 return 0;
2562}
2563
2564static int
2565unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2566 Py_ssize_t width, Py_ssize_t precision)
2567{
2568 /* UTF-8 */
2569 Py_ssize_t length;
2570 PyObject *unicode;
2571 int res;
2572
2573 length = strlen(str);
2574 if (precision != -1)
2575 length = Py_MIN(length, precision);
2576 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2577 if (unicode == NULL)
2578 return -1;
2579
2580 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2581 Py_DECREF(unicode);
2582 return res;
2583}
2584
Victor Stinner96865452011-03-01 23:44:09 +00002585static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002586unicode_fromformat_arg(_PyUnicodeWriter *writer,
2587 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002588{
Victor Stinnere215d962012-10-06 23:03:36 +02002589 const char *p;
2590 Py_ssize_t len;
2591 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002592 Py_ssize_t width;
2593 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002594 int longflag;
2595 int longlongflag;
2596 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002597 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002598
2599 p = f;
2600 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002601 zeropad = 0;
2602 if (*f == '0') {
2603 zeropad = 1;
2604 f++;
2605 }
Victor Stinner96865452011-03-01 23:44:09 +00002606
2607 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 width = -1;
2609 if (Py_ISDIGIT((unsigned)*f)) {
2610 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002611 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002612 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002614 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002615 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002616 return NULL;
2617 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002618 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002619 f++;
2620 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002621 }
2622 precision = -1;
2623 if (*f == '.') {
2624 f++;
2625 if (Py_ISDIGIT((unsigned)*f)) {
2626 precision = (*f - '0');
2627 f++;
2628 while (Py_ISDIGIT((unsigned)*f)) {
2629 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2630 PyErr_SetString(PyExc_ValueError,
2631 "precision too big");
2632 return NULL;
2633 }
2634 precision = (precision * 10) + (*f - '0');
2635 f++;
2636 }
2637 }
Victor Stinner96865452011-03-01 23:44:09 +00002638 if (*f == '%') {
2639 /* "%.3%s" => f points to "3" */
2640 f--;
2641 }
2642 }
2643 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002644 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002645 f--;
2646 }
Victor Stinner96865452011-03-01 23:44:09 +00002647
2648 /* Handle %ld, %lu, %lld and %llu. */
2649 longflag = 0;
2650 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002651 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002652 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002653 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002654 longflag = 1;
2655 ++f;
2656 }
Victor Stinner96865452011-03-01 23:44:09 +00002657 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002658 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002659 longlongflag = 1;
2660 f += 2;
2661 }
Victor Stinner96865452011-03-01 23:44:09 +00002662 }
2663 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002664 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002665 size_tflag = 1;
2666 ++f;
2667 }
Victor Stinnere215d962012-10-06 23:03:36 +02002668
2669 if (f[1] == '\0')
2670 writer->overallocate = 0;
2671
2672 switch (*f) {
2673 case 'c':
2674 {
2675 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002676 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002677 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002678 "character argument not in range(0x110000)");
2679 return NULL;
2680 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002681 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002682 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002683 break;
2684 }
2685
2686 case 'i':
2687 case 'd':
2688 case 'u':
2689 case 'x':
2690 {
2691 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002692 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002693 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002694
2695 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002699 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002700 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002701 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002702 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002703 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002704 va_arg(*vargs, size_t));
2705 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002707 va_arg(*vargs, unsigned int));
2708 }
2709 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002711 }
2712 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002713 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002714 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002715 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002716 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002717 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002718 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002719 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002720 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002721 va_arg(*vargs, Py_ssize_t));
2722 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002723 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002724 va_arg(*vargs, int));
2725 }
2726 assert(len >= 0);
2727
Victor Stinnere215d962012-10-06 23:03:36 +02002728 if (precision < len)
2729 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730
2731 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2733 return NULL;
2734
Victor Stinnere215d962012-10-06 23:03:36 +02002735 if (width > precision) {
2736 Py_UCS4 fillchar;
2737 fill = width - precision;
2738 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002739 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2740 return NULL;
2741 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002742 }
Victor Stinner15a11362012-10-06 23:48:20 +02002743 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002744 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002745 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2746 return NULL;
2747 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002748 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002749
Victor Stinner4a587072013-11-19 12:54:53 +01002750 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2751 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002752 break;
2753 }
2754
2755 case 'p':
2756 {
2757 char number[MAX_LONG_LONG_CHARS];
2758
2759 len = sprintf(number, "%p", va_arg(*vargs, void*));
2760 assert(len >= 0);
2761
2762 /* %p is ill-defined: ensure leading 0x. */
2763 if (number[1] == 'X')
2764 number[1] = 'x';
2765 else if (number[1] != 'x') {
2766 memmove(number + 2, number,
2767 strlen(number) + 1);
2768 number[0] = '0';
2769 number[1] = 'x';
2770 len += 2;
2771 }
2772
Victor Stinner4a587072013-11-19 12:54:53 +01002773 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002774 return NULL;
2775 break;
2776 }
2777
2778 case 's':
2779 {
2780 /* UTF-8 */
2781 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002782 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002783 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002784 break;
2785 }
2786
2787 case 'U':
2788 {
2789 PyObject *obj = va_arg(*vargs, PyObject *);
2790 assert(obj && _PyUnicode_CHECK(obj));
2791
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002792 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002793 return NULL;
2794 break;
2795 }
2796
2797 case 'V':
2798 {
2799 PyObject *obj = va_arg(*vargs, PyObject *);
2800 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002801 if (obj) {
2802 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002804 return NULL;
2805 }
2806 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002807 assert(str != NULL);
2808 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002809 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002810 }
2811 break;
2812 }
2813
2814 case 'S':
2815 {
2816 PyObject *obj = va_arg(*vargs, PyObject *);
2817 PyObject *str;
2818 assert(obj);
2819 str = PyObject_Str(obj);
2820 if (!str)
2821 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002822 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002823 Py_DECREF(str);
2824 return NULL;
2825 }
2826 Py_DECREF(str);
2827 break;
2828 }
2829
2830 case 'R':
2831 {
2832 PyObject *obj = va_arg(*vargs, PyObject *);
2833 PyObject *repr;
2834 assert(obj);
2835 repr = PyObject_Repr(obj);
2836 if (!repr)
2837 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002838 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002839 Py_DECREF(repr);
2840 return NULL;
2841 }
2842 Py_DECREF(repr);
2843 break;
2844 }
2845
2846 case 'A':
2847 {
2848 PyObject *obj = va_arg(*vargs, PyObject *);
2849 PyObject *ascii;
2850 assert(obj);
2851 ascii = PyObject_ASCII(obj);
2852 if (!ascii)
2853 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002854 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002855 Py_DECREF(ascii);
2856 return NULL;
2857 }
2858 Py_DECREF(ascii);
2859 break;
2860 }
2861
2862 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002863 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002864 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002865 break;
2866
2867 default:
2868 /* if we stumble upon an unknown formatting code, copy the rest
2869 of the format string to the output string. (we cannot just
2870 skip the code, since there's no way to know what's in the
2871 argument list) */
2872 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002873 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002874 return NULL;
2875 f = p+len;
2876 return f;
2877 }
2878
2879 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002880 return f;
2881}
2882
Walter Dörwaldd2034312007-05-18 16:29:38 +00002883PyObject *
2884PyUnicode_FromFormatV(const char *format, va_list vargs)
2885{
Victor Stinnere215d962012-10-06 23:03:36 +02002886 va_list vargs2;
2887 const char *f;
2888 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002889
Victor Stinner8f674cc2013-04-17 23:02:17 +02002890 _PyUnicodeWriter_Init(&writer);
2891 writer.min_length = strlen(format) + 100;
2892 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002893
Benjamin Peterson0c212142016-09-20 20:39:33 -07002894 // Copy varags to be able to pass a reference to a subfunction.
2895 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002896
2897 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002898 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002899 f = unicode_fromformat_arg(&writer, f, &vargs2);
2900 if (f == NULL)
2901 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002903 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002904 const char *p;
2905 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002906
Victor Stinnere215d962012-10-06 23:03:36 +02002907 p = f;
2908 do
2909 {
2910 if ((unsigned char)*p > 127) {
2911 PyErr_Format(PyExc_ValueError,
2912 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2913 "string, got a non-ASCII byte: 0x%02x",
2914 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002915 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002916 }
2917 p++;
2918 }
2919 while (*p != '\0' && *p != '%');
2920 len = p - f;
2921
2922 if (*p == '\0')
2923 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002924
2925 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002926 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002927
2928 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002929 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002931 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002932 return _PyUnicodeWriter_Finish(&writer);
2933
2934 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002935 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002936 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002937 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938}
2939
Walter Dörwaldd2034312007-05-18 16:29:38 +00002940PyObject *
2941PyUnicode_FromFormat(const char *format, ...)
2942{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002943 PyObject* ret;
2944 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002945
2946#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002950#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002951 ret = PyUnicode_FromFormatV(format, vargs);
2952 va_end(vargs);
2953 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002954}
2955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956#ifdef HAVE_WCHAR_H
2957
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002958/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959
Victor Stinnerd88d9832011-09-06 02:00:05 +02002960 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002961 character) required to convert the unicode object. Ignore size argument.
2962
Victor Stinnerd88d9832011-09-06 02:00:05 +02002963 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002964 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002965 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002966Py_ssize_t
2967PyUnicode_AsWideChar(PyObject *unicode,
2968 wchar_t *w,
2969 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002970{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002971 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002972 const wchar_t *wstr;
2973
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002974 if (unicode == NULL) {
2975 PyErr_BadInternalCall();
2976 return -1;
2977 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002978 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002979 if (wstr == NULL)
2980 return -1;
2981
Victor Stinner5593d8a2010-10-02 11:11:27 +00002982 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002983 if (size > res)
2984 size = res + 1;
2985 else
2986 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002987 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002988 return res;
2989 }
2990 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002991 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002992}
2993
Victor Stinner137c34c2010-09-29 10:25:54 +00002994wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002995PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002996 Py_ssize_t *size)
2997{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002998 const wchar_t *wstr;
2999 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003000 Py_ssize_t buflen;
3001
3002 if (unicode == NULL) {
3003 PyErr_BadInternalCall();
3004 return NULL;
3005 }
3006
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003007 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3008 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003009 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003010 }
3011 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3012 PyErr_SetString(PyExc_ValueError,
3013 "embedded null character");
3014 return NULL;
3015 }
3016
3017 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003018 if (buffer == NULL) {
3019 PyErr_NoMemory();
3020 return NULL;
3021 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003022 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003023 if (size != NULL)
3024 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003025 return buffer;
3026}
3027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003028#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029
Alexander Belopolsky40018472011-02-26 01:02:56 +00003030PyObject *
3031PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003032{
Victor Stinner8faf8212011-12-08 22:14:11 +01003033 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003034 PyErr_SetString(PyExc_ValueError,
3035 "chr() arg not in range(0x110000)");
3036 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003037 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003038
Victor Stinner985a82a2014-01-03 12:53:47 +01003039 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003040}
3041
Alexander Belopolsky40018472011-02-26 01:02:56 +00003042PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003043PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003045 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003047 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003048 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003049 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 Py_INCREF(obj);
3051 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003052 }
3053 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003054 /* For a Unicode subtype that's not a Unicode object,
3055 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003056 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003057 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003058 PyErr_Format(PyExc_TypeError,
3059 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003060 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003061 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062}
3063
Alexander Belopolsky40018472011-02-26 01:02:56 +00003064PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003065PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003066 const char *encoding,
3067 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003070 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003071
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 PyErr_BadInternalCall();
3074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003076
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003077 /* Decoding bytes objects is the most common case and should be fast */
3078 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003079 if (PyBytes_GET_SIZE(obj) == 0)
3080 _Py_RETURN_UNICODE_EMPTY();
3081 v = PyUnicode_Decode(
3082 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3083 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003084 return v;
3085 }
3086
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003087 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 PyErr_SetString(PyExc_TypeError,
3089 "decoding str is not supported");
3090 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003091 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3094 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3095 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003096 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003097 Py_TYPE(obj)->tp_name);
3098 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003099 }
Tim Petersced69f82003-09-16 20:30:58 +00003100
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003101 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003102 PyBuffer_Release(&buffer);
3103 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003105
Serhiy Storchaka05997252013-01-26 12:14:02 +02003106 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003107 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003108 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109}
3110
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters between two kept characters collapses into a single
   '_' (leading and trailing runs are dropped).  The result is written to
   lower, which holds lower_len bytes including the null terminator.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *out = lower;
    char *out_end = &lower[lower_len - 1];   /* slot reserved for '\0' */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (!Py_ISALNUM(ch) && ch != '.') {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(ch);
    }

    *out = '\0';
    return 1;
}
3159
Alexander Belopolsky40018472011-02-26 01:02:56 +00003160PyObject *
3161PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003162 Py_ssize_t size,
3163 const char *encoding,
3164 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003165{
3166 PyObject *buffer = NULL, *unicode;
3167 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003168 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3169
3170 if (encoding == NULL) {
3171 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3172 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003173
Fred Drakee4315f52000-05-09 19:53:39 +00003174 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003175 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3176 char *lower = buflower;
3177
3178 /* Fast paths */
3179 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3180 lower += 3;
3181 if (*lower == '_') {
3182 /* Match "utf8" and "utf_8" */
3183 lower++;
3184 }
3185
3186 if (lower[0] == '8' && lower[1] == 0) {
3187 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3188 }
3189 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3190 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3191 }
3192 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3193 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3194 }
3195 }
3196 else {
3197 if (strcmp(lower, "ascii") == 0
3198 || strcmp(lower, "us_ascii") == 0) {
3199 return PyUnicode_DecodeASCII(s, size, errors);
3200 }
Steve Dowercc16be82016-09-08 10:35:16 -07003201 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003202 else if (strcmp(lower, "mbcs") == 0) {
3203 return PyUnicode_DecodeMBCS(s, size, errors);
3204 }
3205 #endif
3206 else if (strcmp(lower, "latin1") == 0
3207 || strcmp(lower, "latin_1") == 0
3208 || strcmp(lower, "iso_8859_1") == 0
3209 || strcmp(lower, "iso8859_1") == 0) {
3210 return PyUnicode_DecodeLatin1(s, size, errors);
3211 }
3212 }
Victor Stinner37296e82010-06-10 13:36:23 +00003213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214
3215 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003216 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003217 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003218 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003219 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 if (buffer == NULL)
3221 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003222 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 if (unicode == NULL)
3224 goto onError;
3225 if (!PyUnicode_Check(unicode)) {
3226 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003227 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3228 "use codecs.decode() to decode to arbitrary types",
3229 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003230 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 Py_DECREF(unicode);
3232 goto onError;
3233 }
3234 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003235 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003236
Benjamin Peterson29060642009-01-31 22:14:21 +00003237 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 Py_XDECREF(buffer);
3239 return NULL;
3240}
3241
Alexander Belopolsky40018472011-02-26 01:02:56 +00003242PyObject *
3243PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003244 const char *encoding,
3245 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003246{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003247 if (!PyUnicode_Check(unicode)) {
3248 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003249 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250 }
3251
Serhiy Storchaka00939072016-10-27 21:05:49 +03003252 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3253 "PyUnicode_AsDecodedObject() is deprecated; "
3254 "use PyCodec_Decode() to decode from str", 1) < 0)
3255 return NULL;
3256
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003257 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003259
3260 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003261 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003262}
3263
Alexander Belopolsky40018472011-02-26 01:02:56 +00003264PyObject *
3265PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003266 const char *encoding,
3267 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003268{
3269 PyObject *v;
3270
3271 if (!PyUnicode_Check(unicode)) {
3272 PyErr_BadArgument();
3273 goto onError;
3274 }
3275
Serhiy Storchaka00939072016-10-27 21:05:49 +03003276 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3277 "PyUnicode_AsDecodedUnicode() is deprecated; "
3278 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3279 return NULL;
3280
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003281 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003282 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003283
3284 /* Decode via the codec registry */
3285 v = PyCodec_Decode(unicode, encoding, errors);
3286 if (v == NULL)
3287 goto onError;
3288 if (!PyUnicode_Check(v)) {
3289 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003290 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3291 "use codecs.decode() to decode to arbitrary types",
3292 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003293 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003294 Py_DECREF(v);
3295 goto onError;
3296 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003297 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003298
Benjamin Peterson29060642009-01-31 22:14:21 +00003299 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003300 return NULL;
3301}
3302
Alexander Belopolsky40018472011-02-26 01:02:56 +00003303PyObject *
3304PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003305 Py_ssize_t size,
3306 const char *encoding,
3307 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308{
3309 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003310
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003311 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3315 Py_DECREF(unicode);
3316 return v;
3317}
3318
Alexander Belopolsky40018472011-02-26 01:02:56 +00003319PyObject *
3320PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003321 const char *encoding,
3322 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003323{
3324 PyObject *v;
3325
3326 if (!PyUnicode_Check(unicode)) {
3327 PyErr_BadArgument();
3328 goto onError;
3329 }
3330
Serhiy Storchaka00939072016-10-27 21:05:49 +03003331 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3332 "PyUnicode_AsEncodedObject() is deprecated; "
3333 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3334 "or PyCodec_Encode() for generic encoding", 1) < 0)
3335 return NULL;
3336
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003337 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003339
3340 /* Encode via the codec registry */
3341 v = PyCodec_Encode(unicode, encoding, errors);
3342 if (v == NULL)
3343 goto onError;
3344 return v;
3345
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003347 return NULL;
3348}
3349
/* Locate the first wide character of wstr that wcstombs() cannot encode in
   the current locale.

   The string is converted one character at a time (a surrogate pair counts
   as one character when wchar_t is 16 bits wide).  Returns the index of
   the offending character, or 0 if every character could be converted. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (possibly a surrogate pair) plus a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        unit[0] = *cursor++;
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cursor))
        {
            unit[1] = *cursor++;
        }
        else {
            unit[1] = 0;
        }
#else
        unit[0] = *cursor++;
#endif

        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return char_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3396
Victor Stinner1b579672011-12-17 05:47:23 +01003397static int
3398locale_error_handler(const char *errors, int *surrogateescape)
3399{
Victor Stinner50149202015-09-22 00:26:54 +02003400 _Py_error_handler error_handler = get_error_handler(errors);
3401 switch (error_handler)
3402 {
3403 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003404 *surrogateescape = 0;
3405 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003406 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003407 *surrogateescape = 1;
3408 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003409 default:
3410 PyErr_Format(PyExc_ValueError,
3411 "only 'strict' and 'surrogateescape' error handlers "
3412 "are supported, not '%s'",
3413 errors);
3414 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003415 }
Victor Stinner1b579672011-12-17 05:47:23 +01003416}
3417
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003419PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003420{
3421 Py_ssize_t wlen, wlen2;
3422 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003423 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003424 PyObject *bytes, *reason, *exc;
3425 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003426 int surrogateescape;
3427
3428 if (locale_error_handler(errors, &surrogateescape) < 0)
3429 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003430
3431 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3432 if (wstr == NULL)
3433 return NULL;
3434
3435 wlen2 = wcslen(wstr);
3436 if (wlen2 != wlen) {
3437 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003438 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003439 return NULL;
3440 }
3441
3442 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003443 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003444 char *str;
3445
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003446 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003447 if (str == NULL) {
3448 if (error_pos == (size_t)-1) {
3449 PyErr_NoMemory();
3450 PyMem_Free(wstr);
3451 return NULL;
3452 }
3453 else {
3454 goto encode_error;
3455 }
3456 }
3457 PyMem_Free(wstr);
3458
3459 bytes = PyBytes_FromString(str);
3460 PyMem_Free(str);
3461 }
3462 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003463 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003464 size_t len, len2;
3465
3466 len = wcstombs(NULL, wstr, 0);
3467 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003468 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003469 goto encode_error;
3470 }
3471
3472 bytes = PyBytes_FromStringAndSize(NULL, len);
3473 if (bytes == NULL) {
3474 PyMem_Free(wstr);
3475 return NULL;
3476 }
3477
3478 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3479 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003480 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003481 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003482 goto encode_error;
3483 }
3484 PyMem_Free(wstr);
3485 }
3486 return bytes;
3487
3488encode_error:
3489 errmsg = strerror(errno);
3490 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003491
3492 if (error_pos == (size_t)-1)
3493 error_pos = wcstombs_errorpos(wstr);
3494
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003495 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003496
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003497 wstr = Py_DecodeLocale(errmsg, &errlen);
3498 if (wstr != NULL) {
3499 reason = PyUnicode_FromWideChar(wstr, errlen);
3500 PyMem_RawFree(wstr);
3501 } else {
3502 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003503 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003504
Victor Stinner2f197072011-12-17 07:08:30 +01003505 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003506 reason = PyUnicode_FromString(
3507 "wcstombs() encountered an unencodable "
3508 "wide character");
3509 if (reason == NULL)
3510 return NULL;
3511
3512 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3513 "locale", unicode,
3514 (Py_ssize_t)error_pos,
3515 (Py_ssize_t)(error_pos+1),
3516 reason);
3517 Py_DECREF(reason);
3518 if (exc != NULL) {
3519 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003520 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003521 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003522 return NULL;
3523}
3524
Victor Stinnerad158722010-10-27 00:25:46 +00003525PyObject *
3526PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003527{
Steve Dowercc16be82016-09-08 10:35:16 -07003528#if defined(__APPLE__)
3529 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003530#else
Victor Stinner793b5312011-04-27 00:24:21 +02003531 PyInterpreterState *interp = PyThreadState_GET()->interp;
3532 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3533 cannot use it to encode and decode filenames before it is loaded. Load
3534 the Python codec requires to encode at least its own filename. Use the C
3535 version of the locale codec until the codec registry is initialized and
3536 the Python codec is loaded.
3537
3538 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3539 cannot only rely on it: check also interp->fscodec_initialized for
3540 subinterpreters. */
3541 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003542 return PyUnicode_AsEncodedString(unicode,
3543 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003544 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003545 }
3546 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003547 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003548 }
Victor Stinnerad158722010-10-27 00:25:46 +00003549#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003550}
3551
Alexander Belopolsky40018472011-02-26 01:02:56 +00003552PyObject *
3553PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003554 const char *encoding,
3555 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556{
3557 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003558 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003559
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 if (!PyUnicode_Check(unicode)) {
3561 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 }
Fred Drakee4315f52000-05-09 19:53:39 +00003564
Victor Stinner942889a2016-09-05 15:40:10 -07003565 if (encoding == NULL) {
3566 return _PyUnicode_AsUTF8String(unicode, errors);
3567 }
3568
Fred Drakee4315f52000-05-09 19:53:39 +00003569 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003570 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3571 char *lower = buflower;
3572
3573 /* Fast paths */
3574 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3575 lower += 3;
3576 if (*lower == '_') {
3577 /* Match "utf8" and "utf_8" */
3578 lower++;
3579 }
3580
3581 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003582 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003583 }
3584 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3585 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3586 }
3587 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3588 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3589 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003590 }
Victor Stinner942889a2016-09-05 15:40:10 -07003591 else {
3592 if (strcmp(lower, "ascii") == 0
3593 || strcmp(lower, "us_ascii") == 0) {
3594 return _PyUnicode_AsASCIIString(unicode, errors);
3595 }
Steve Dowercc16be82016-09-08 10:35:16 -07003596#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003597 else if (strcmp(lower, "mbcs") == 0) {
3598 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3599 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003600#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003601 else if (strcmp(lower, "latin1") == 0 ||
3602 strcmp(lower, "latin_1") == 0 ||
3603 strcmp(lower, "iso_8859_1") == 0 ||
3604 strcmp(lower, "iso8859_1") == 0) {
3605 return _PyUnicode_AsLatin1String(unicode, errors);
3606 }
3607 }
Victor Stinner37296e82010-06-10 13:36:23 +00003608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609
3610 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003611 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003613 return NULL;
3614
3615 /* The normal path */
3616 if (PyBytes_Check(v))
3617 return v;
3618
3619 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003620 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003621 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003622 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003623
3624 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003625 "encoder %s returned bytearray instead of bytes; "
3626 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003627 encoding);
3628 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003629 Py_DECREF(v);
3630 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003631 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003632
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003633 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3634 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003635 Py_DECREF(v);
3636 return b;
3637 }
3638
3639 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003640 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3641 "use codecs.encode() to encode to arbitrary types",
3642 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003643 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003644 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003645 return NULL;
3646}
3647
Alexander Belopolsky40018472011-02-26 01:02:56 +00003648PyObject *
3649PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003650 const char *encoding,
3651 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003652{
3653 PyObject *v;
3654
3655 if (!PyUnicode_Check(unicode)) {
3656 PyErr_BadArgument();
3657 goto onError;
3658 }
3659
Serhiy Storchaka00939072016-10-27 21:05:49 +03003660 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3661 "PyUnicode_AsEncodedUnicode() is deprecated; "
3662 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3663 return NULL;
3664
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003665 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003667
3668 /* Encode via the codec registry */
3669 v = PyCodec_Encode(unicode, encoding, errors);
3670 if (v == NULL)
3671 goto onError;
3672 if (!PyUnicode_Check(v)) {
3673 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003674 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3675 "use codecs.encode() to encode to arbitrary types",
3676 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003677 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003678 Py_DECREF(v);
3679 goto onError;
3680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003682
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 return NULL;
3685}
3686
/* Find the byte offset in str (examining at most len bytes) at which
   mbrtowc() reports a conversion error or an incomplete character.

   Returns that offset, or 0 when no such position is found, when a null
   wide character is decoded first, or when mbrtowc() is not available
   (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            return 0;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3717
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003718PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003720 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003721{
3722 wchar_t smallbuf[256];
3723 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3724 wchar_t *wstr;
3725 size_t wlen, wlen2;
3726 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003727 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003728 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003729 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003730 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003731
3732 if (locale_error_handler(errors, &surrogateescape) < 0)
3733 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003734
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003735 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3736 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003737 return NULL;
3738 }
3739
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003740 if (surrogateescape) {
3741 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003742 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003743 if (wstr == NULL) {
3744 if (wlen == (size_t)-1)
3745 PyErr_NoMemory();
3746 else
3747 PyErr_SetFromErrno(PyExc_OSError);
3748 return NULL;
3749 }
3750
3751 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003752 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003753 }
3754 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003755 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003756#ifndef HAVE_BROKEN_MBSTOWCS
3757 wlen = mbstowcs(NULL, str, 0);
3758#else
3759 wlen = len;
3760#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003761 if (wlen == (size_t)-1)
3762 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003763 if (wlen+1 <= smallbuf_len) {
3764 wstr = smallbuf;
3765 }
3766 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003767 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003768 if (!wstr)
3769 return PyErr_NoMemory();
3770 }
3771
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003772 wlen2 = mbstowcs(wstr, str, wlen+1);
3773 if (wlen2 == (size_t)-1) {
3774 if (wstr != smallbuf)
3775 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003776 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003777 }
3778#ifdef HAVE_BROKEN_MBSTOWCS
3779 assert(wlen2 == wlen);
3780#endif
3781 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3782 if (wstr != smallbuf)
3783 PyMem_Free(wstr);
3784 }
3785 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003786
3787decode_error:
3788 errmsg = strerror(errno);
3789 assert(errmsg != NULL);
3790
3791 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003792 wstr = Py_DecodeLocale(errmsg, &errlen);
3793 if (wstr != NULL) {
3794 reason = PyUnicode_FromWideChar(wstr, errlen);
3795 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003796 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003797
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003798 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003799 reason = PyUnicode_FromString(
3800 "mbstowcs() encountered an invalid multibyte sequence");
3801 if (reason == NULL)
3802 return NULL;
3803
3804 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3805 "locale", str, len,
3806 (Py_ssize_t)error_pos,
3807 (Py_ssize_t)(error_pos+1),
3808 reason);
3809 Py_DECREF(reason);
3810 if (exc != NULL) {
3811 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003812 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003813 }
3814 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003815}
3816
3817PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003818PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003819{
3820 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003821 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003822}
3823
3824
3825PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003826PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003827 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003828 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3829}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003830
Christian Heimes5894ba72007-11-04 11:43:14 +00003831PyObject*
3832PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3833{
Steve Dowercc16be82016-09-08 10:35:16 -07003834#if defined(__APPLE__)
3835 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003836#else
Victor Stinner793b5312011-04-27 00:24:21 +02003837 PyInterpreterState *interp = PyThreadState_GET()->interp;
3838 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3839 cannot use it to encode and decode filenames before it is loaded. Load
3840 the Python codec requires to encode at least its own filename. Use the C
3841 version of the locale codec until the codec registry is initialized and
3842 the Python codec is loaded.
3843
3844 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3845 cannot only rely on it: check also interp->fscodec_initialized for
3846 subinterpreters. */
3847 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003848 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003849 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003850 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003851 }
3852 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003853 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003854 }
Victor Stinnerad158722010-10-27 00:25:46 +00003855#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003856}
3857
Martin v. Löwis011e8422009-05-05 04:43:17 +00003858
3859int
3860PyUnicode_FSConverter(PyObject* arg, void* addr)
3861{
Brett Cannonec6ce872016-09-06 15:50:29 -07003862 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003863 PyObject *output = NULL;
3864 Py_ssize_t size;
3865 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003866 if (arg == NULL) {
3867 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003868 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003869 return 1;
3870 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003871 path = PyOS_FSPath(arg);
3872 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003873 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003874 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003875 if (PyBytes_Check(path)) {
3876 output = path;
3877 }
3878 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3879 output = PyUnicode_EncodeFSDefault(path);
3880 Py_DECREF(path);
3881 if (!output) {
3882 return 0;
3883 }
3884 assert(PyBytes_Check(output));
3885 }
3886
Victor Stinner0ea2a462010-04-30 00:22:08 +00003887 size = PyBytes_GET_SIZE(output);
3888 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003889 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003890 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003891 Py_DECREF(output);
3892 return 0;
3893 }
3894 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003895 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003896}
3897
3898
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003899int
3900PyUnicode_FSDecoder(PyObject* arg, void* addr)
3901{
Brett Cannona5711202016-09-06 19:36:01 -07003902 int is_buffer = 0;
3903 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003904 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003905 if (arg == NULL) {
3906 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003907 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003908 return 1;
3909 }
Brett Cannona5711202016-09-06 19:36:01 -07003910
3911 is_buffer = PyObject_CheckBuffer(arg);
3912 if (!is_buffer) {
3913 path = PyOS_FSPath(arg);
3914 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003915 return 0;
3916 }
Brett Cannona5711202016-09-06 19:36:01 -07003917 }
3918 else {
3919 path = arg;
3920 Py_INCREF(arg);
3921 }
3922
3923 if (PyUnicode_Check(path)) {
3924 if (PyUnicode_READY(path) == -1) {
3925 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003926 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003927 }
3928 output = path;
3929 }
3930 else if (PyBytes_Check(path) || is_buffer) {
3931 PyObject *path_bytes = NULL;
3932
3933 if (!PyBytes_Check(path) &&
3934 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3935 "path should be string, bytes, or os.PathLike, not %.200s",
3936 Py_TYPE(arg)->tp_name)) {
3937 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003938 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003939 }
3940 path_bytes = PyBytes_FromObject(path);
3941 Py_DECREF(path);
3942 if (!path_bytes) {
3943 return 0;
3944 }
3945 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3946 PyBytes_GET_SIZE(path_bytes));
3947 Py_DECREF(path_bytes);
3948 if (!output) {
3949 return 0;
3950 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003951 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003952 else {
3953 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003954 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003955 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003956 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003957 return 0;
3958 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003959 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003960 Py_DECREF(output);
3961 return 0;
3962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003964 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003965 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003966 Py_DECREF(output);
3967 return 0;
3968 }
3969 *(PyObject**)addr = output;
3970 return Py_CLEANUP_SUPPORTED;
3971}
3972
3973
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003974const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003976{
Christian Heimesf3863112007-11-22 07:46:41 +00003977 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003979 if (!PyUnicode_Check(unicode)) {
3980 PyErr_BadArgument();
3981 return NULL;
3982 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003983 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003984 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003986 if (PyUnicode_UTF8(unicode) == NULL) {
3987 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003988 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989 if (bytes == NULL)
3990 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003991 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3992 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003993 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994 Py_DECREF(bytes);
3995 return NULL;
3996 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003998 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003999 PyBytes_AS_STRING(bytes),
4000 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 Py_DECREF(bytes);
4002 }
4003
4004 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004005 *psize = PyUnicode_UTF8_LENGTH(unicode);
4006 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004007}
4008
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004009const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4013}
4014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015Py_UNICODE *
4016PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 const unsigned char *one_byte;
4019#if SIZEOF_WCHAR_T == 4
4020 const Py_UCS2 *two_bytes;
4021#else
4022 const Py_UCS4 *four_bytes;
4023 const Py_UCS4 *ucs4_end;
4024 Py_ssize_t num_surrogates;
4025#endif
4026 wchar_t *w;
4027 wchar_t *wchar_end;
4028
4029 if (!PyUnicode_Check(unicode)) {
4030 PyErr_BadArgument();
4031 return NULL;
4032 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004033 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004035 assert(_PyUnicode_KIND(unicode) != 0);
4036 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004038 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004040 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4041 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 num_surrogates = 0;
4043
4044 for (; four_bytes < ucs4_end; ++four_bytes) {
4045 if (*four_bytes > 0xFFFF)
4046 ++num_surrogates;
4047 }
4048
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004049 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4050 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4051 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 PyErr_NoMemory();
4053 return NULL;
4054 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004055 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004057 w = _PyUnicode_WSTR(unicode);
4058 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4059 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4061 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004062 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004064 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4065 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 }
4067 else
4068 *w = *four_bytes;
4069
4070 if (w > wchar_end) {
4071 assert(0 && "Miscalculated string end");
4072 }
4073 }
4074 *w = 0;
4075#else
4076 /* sizeof(wchar_t) == 4 */
4077 Py_FatalError("Impossible unicode object state, wstr and str "
4078 "should share memory already.");
4079 return NULL;
4080#endif
4081 }
4082 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004083 if ((size_t)_PyUnicode_LENGTH(unicode) >
4084 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4085 PyErr_NoMemory();
4086 return NULL;
4087 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004088 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4089 (_PyUnicode_LENGTH(unicode) + 1));
4090 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091 PyErr_NoMemory();
4092 return NULL;
4093 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004094 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4095 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4096 w = _PyUnicode_WSTR(unicode);
4097 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004099 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4100 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004101 for (; w < wchar_end; ++one_byte, ++w)
4102 *w = *one_byte;
4103 /* null-terminate the wstr */
4104 *w = 0;
4105 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004106 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004108 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004109 for (; w < wchar_end; ++two_bytes, ++w)
4110 *w = *two_bytes;
4111 /* null-terminate the wstr */
4112 *w = 0;
4113#else
4114 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004115 PyObject_FREE(_PyUnicode_WSTR(unicode));
4116 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004117 Py_FatalError("Impossible unicode object state, wstr "
4118 "and str should share memory already.");
4119 return NULL;
4120#endif
4121 }
4122 else {
4123 assert(0 && "This should never happen.");
4124 }
4125 }
4126 }
4127 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004128 *size = PyUnicode_WSTR_LENGTH(unicode);
4129 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004130}
4131
Alexander Belopolsky40018472011-02-26 01:02:56 +00004132Py_UNICODE *
4133PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136}
4137
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004138const Py_UNICODE *
4139_PyUnicode_AsUnicode(PyObject *unicode)
4140{
4141 Py_ssize_t size;
4142 const Py_UNICODE *wstr;
4143
4144 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4145 if (wstr && wcslen(wstr) != (size_t)size) {
4146 PyErr_SetString(PyExc_ValueError, "embedded null character");
4147 return NULL;
4148 }
4149 return wstr;
4150}
4151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152
Alexander Belopolsky40018472011-02-26 01:02:56 +00004153Py_ssize_t
4154PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155{
4156 if (!PyUnicode_Check(unicode)) {
4157 PyErr_BadArgument();
4158 goto onError;
4159 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004160 if (_PyUnicode_WSTR(unicode) == NULL) {
4161 if (PyUnicode_AsUnicode(unicode) == NULL)
4162 goto onError;
4163 }
4164 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 return -1;
4168}
4169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170Py_ssize_t
4171PyUnicode_GetLength(PyObject *unicode)
4172{
Victor Stinner07621332012-06-16 04:53:46 +02004173 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004174 PyErr_BadArgument();
4175 return -1;
4176 }
Victor Stinner07621332012-06-16 04:53:46 +02004177 if (PyUnicode_READY(unicode) == -1)
4178 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179 return PyUnicode_GET_LENGTH(unicode);
4180}
4181
4182Py_UCS4
4183PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4184{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004185 void *data;
4186 int kind;
4187
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004188 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004189 PyErr_BadArgument();
4190 return (Py_UCS4)-1;
4191 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004192 if (PyUnicode_READY(unicode) == -1) {
4193 return (Py_UCS4)-1;
4194 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004195 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004196 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197 return (Py_UCS4)-1;
4198 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004199 data = PyUnicode_DATA(unicode);
4200 kind = PyUnicode_KIND(unicode);
4201 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004202}
4203
4204int
4205PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4206{
4207 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004208 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004209 return -1;
4210 }
Victor Stinner488fa492011-12-12 00:01:39 +01004211 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004212 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004213 PyErr_SetString(PyExc_IndexError, "string index out of range");
4214 return -1;
4215 }
Victor Stinner488fa492011-12-12 00:01:39 +01004216 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004217 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004218 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4219 PyErr_SetString(PyExc_ValueError, "character out of range");
4220 return -1;
4221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004222 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4223 index, ch);
4224 return 0;
4225}
4226
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4232
Victor Stinner554f3f02010-06-16 23:33:54 +00004233/* create or adjust a UnicodeDecodeError */
4234static void
4235make_decode_exception(PyObject **exceptionObject,
4236 const char *encoding,
4237 const char *input, Py_ssize_t length,
4238 Py_ssize_t startpos, Py_ssize_t endpos,
4239 const char *reason)
4240{
4241 if (*exceptionObject == NULL) {
4242 *exceptionObject = PyUnicodeDecodeError_Create(
4243 encoding, input, length, startpos, endpos, reason);
4244 }
4245 else {
4246 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4247 goto onError;
4248 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4249 goto onError;
4250 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4251 goto onError;
4252 }
4253 return;
4254
4255onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004256 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004257}
4258
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders whose result is built in a
   wchar_t-based unicode object (*output, already filled up to *outpos).

   Steps performed:
     - look up the error handler named by `errors` unless *errorHandler is
       already set (the looked-up handler is stored back for reuse);
     - create or update the UnicodeDecodeError in *exceptionObject;
     - call the handler and check that it returned a (str, int) tuple;
     - re-read *input / *inend from the exception, since the callback may
       have replaced the bytes object being decoded;
     - append the replacement string to *output, resizing it if needed, and
       advance *outpos;
     - set *endinpos and *inptr to the resume position chosen by the
       handler (a negative position counts from the end of the input).

   Return 0 on success, -1 on error (with an exception set).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple format; the text after the ';' is reused
       on its own (&argparse[3]) as the TypeError message below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is both safe and sufficient */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  A length-based copy is needed
       because the replacement may contain embedded NUL characters:
       wcsncpy() would stop at the first one and zero-fill the remainder,
       silently dropping the rest of the replacement. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371
4372static int
4373unicode_decode_call_errorhandler_writer(
4374 const char *errors, PyObject **errorHandler,
4375 const char *encoding, const char *reason,
4376 const char **input, const char **inend, Py_ssize_t *startinpos,
4377 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4378 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4379{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004380 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004381
4382 PyObject *restuple = NULL;
4383 PyObject *repunicode = NULL;
4384 Py_ssize_t insize;
4385 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004386 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004387 PyObject *inputobj = NULL;
4388
4389 if (*errorHandler == NULL) {
4390 *errorHandler = PyCodec_LookupError(errors);
4391 if (*errorHandler == NULL)
4392 goto onError;
4393 }
4394
4395 make_decode_exception(exceptionObject,
4396 encoding,
4397 *input, *inend - *input,
4398 *startinpos, *endinpos,
4399 reason);
4400 if (*exceptionObject == NULL)
4401 goto onError;
4402
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004403 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004404 if (restuple == NULL)
4405 goto onError;
4406 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004407 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408 goto onError;
4409 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004410 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004412
4413 /* Copy back the bytes variables, which might have been modified by the
4414 callback */
4415 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4416 if (!inputobj)
4417 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004418 *input = PyBytes_AS_STRING(inputobj);
4419 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004420 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004421 /* we can DECREF safely, as the exception has another reference,
4422 so the object won't go away. */
4423 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004426 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004427 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004428 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004430 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431
Victor Stinner170ca6f2013-04-18 00:25:28 +02004432 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004433 if (replen > 1) {
4434 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004435 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004436 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4437 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4438 goto onError;
4439 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004440 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004441 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004442
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004444 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004447 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004452 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453}
4454
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455/* --- UTF-7 Codec -------------------------------------------------------- */
4456
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457/* See RFC2152 for details. We encode conservatively and decode liberally. */
4458
4459/* Three simple macros defining base-64. */
4460
/* IS_BASE64(c): non-zero when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))

/* FROM_BASE64(c): the 6-bit value of c.  Only meaningful when
   IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) != '+' ? 63 : 62)

/* TO_BASE64(n): the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"          \
     "0123456789" "+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): non-zero when a byte found in UTF-7 input stands for
 * itself.  Decoding is permissive: every ASCII byte qualifies except '+',
 * which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4489
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This read-only table, indexed by ASCII code,
 * gives the category of each character:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4523
/* ENCODE_DIRECT(c, directO, directWS): non-zero when character c should be
 * written to the UTF-7 output as itself rather than base-64 encoded.
 *
 * c must be a non-NUL ASCII character, and additionally either belong to
 * Set D, or be whitespace while directWS is true, or belong to Set O while
 * directO is true.  RFC2152 makes it clear that whether Set O and
 * whitespace are encoded directly varies between applications, hence the
 * two flags.
 *
 * All three arguments are parenthesized in the expansion so that arbitrary
 * expressions (e.g. `a || b`) can be passed as flags without operator
 * precedence changing the result.  Note that c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535
Alexander Belopolsky40018472011-02-26 01:02:56 +00004536PyObject *
4537PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004538 Py_ssize_t size,
4539 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004541 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4542}
4543
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544/* The decoder. The only state we preserve is our read position,
4545 * i.e. how many characters we have consumed. So if we end in the
4546 * middle of a shift sequence we have to back off the read position
4547 * and the output to the beginning of the sequence, otherwise we lose
4548 * all the shift state (seen bits, number of bits seen, high
4549 * surrogate). */
4550
Alexander Belopolsky40018472011-02-26 01:02:56 +00004551PyObject *
4552PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004553 Py_ssize_t size,
4554 const char *errors,
4555 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004556{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004558 Py_ssize_t startinpos;
4559 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004561 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004562 const char *errmsg = "";
4563 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004564 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 unsigned int base64bits = 0;
4566 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004567 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 PyObject *errorHandler = NULL;
4569 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004570
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004571 if (size == 0) {
4572 if (consumed)
4573 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004574 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004575 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004577 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004578 _PyUnicodeWriter_Init(&writer);
4579 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004580
4581 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 e = s + size;
4583
4584 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004585 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004587 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 if (inShift) { /* in a base-64 section */
4590 if (IS_BASE64(ch)) { /* consume a base-64 character */
4591 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4592 base64bits += 6;
4593 s++;
4594 if (base64bits >= 16) {
4595 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004596 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 base64bits -= 16;
4598 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004599 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 if (surrogate) {
4601 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004602 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4603 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004604 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004605 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004607 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 }
4609 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004610 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004611 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 }
4614 }
Victor Stinner551ac952011-11-29 22:58:13 +01004615 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 /* first surrogate */
4617 surrogate = outCh;
4618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004620 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004621 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 }
4623 }
4624 }
4625 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 if (base64bits > 0) { /* left-over bits */
4628 if (base64bits >= 6) {
4629 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004630 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 errmsg = "partial character in shift sequence";
4632 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 else {
4635 /* Some bits remain; they should be zero */
4636 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004637 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 errmsg = "non-zero padding bits in shift sequence";
4639 goto utf7Error;
4640 }
4641 }
4642 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004643 if (surrogate && DECODE_DIRECT(ch)) {
4644 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4645 goto onError;
4646 }
4647 surrogate = 0;
4648 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 /* '-' is absorbed; other terminating
4650 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004651 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 }
4654 }
4655 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 s++; /* consume '+' */
4658 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004660 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004661 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 }
4663 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004664 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004665 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004666 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004668 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669 }
4670 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004672 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004673 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004674 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676 else {
4677 startinpos = s-starts;
4678 s++;
4679 errmsg = "unexpected special character";
4680 goto utf7Error;
4681 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004682 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004685 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 errors, &errorHandler,
4687 "utf7", errmsg,
4688 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004689 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004691 }
4692
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 /* end of string */
4694
4695 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4696 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004697 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 if (surrogate ||
4699 (base64bits >= 6) ||
4700 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004702 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 errors, &errorHandler,
4704 "utf7", "unterminated shift sequence",
4705 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 goto onError;
4708 if (s < e)
4709 goto restart;
4710 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712
4713 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004714 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004715 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004716 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004717 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004718 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004719 writer.kind, writer.data, shiftOutStart);
4720 Py_XDECREF(errorHandler);
4721 Py_XDECREF(exc);
4722 _PyUnicodeWriter_Dealloc(&writer);
4723 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004724 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004725 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 }
4727 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004728 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004730 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 Py_XDECREF(errorHandler);
4733 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004734 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 Py_XDECREF(errorHandler);
4738 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004739 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740 return NULL;
4741}
4742
4743
Alexander Belopolsky40018472011-02-26 01:02:56 +00004744PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004745_PyUnicode_EncodeUTF7(PyObject *str,
4746 int base64SetO,
4747 int base64WhiteSpace,
4748 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004749{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004750 int kind;
4751 void *data;
4752 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004753 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004755 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004756 unsigned int base64bits = 0;
4757 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758 char * out;
4759 char * start;
4760
Benjamin Petersonbac79492012-01-14 13:34:47 -05004761 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004762 return NULL;
4763 kind = PyUnicode_KIND(str);
4764 data = PyUnicode_DATA(str);
4765 len = PyUnicode_GET_LENGTH(str);
4766
4767 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004768 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004769
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004770 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004771 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004772 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004773 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774 if (v == NULL)
4775 return NULL;
4776
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004777 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004778 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004779 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004780
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 if (inShift) {
4782 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4783 /* shifting out */
4784 if (base64bits) { /* output remaining bits */
4785 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4786 base64buffer = 0;
4787 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788 }
4789 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 /* Characters not in the BASE64 set implicitly unshift the sequence
4791 so no '-' is required, except if the character is itself a '-' */
4792 if (IS_BASE64(ch) || ch == '-') {
4793 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004794 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 *out++ = (char) ch;
4796 }
4797 else {
4798 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004799 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004800 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 else { /* not in a shift sequence */
4802 if (ch == '+') {
4803 *out++ = '+';
4804 *out++ = '-';
4805 }
4806 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4807 *out++ = (char) ch;
4808 }
4809 else {
4810 *out++ = '+';
4811 inShift = 1;
4812 goto encode_char;
4813 }
4814 }
4815 continue;
4816encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004818 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004819
Antoine Pitrou244651a2009-05-04 18:56:13 +00004820 /* code first surrogate */
4821 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004822 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004823 while (base64bits >= 6) {
4824 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4825 base64bits -= 6;
4826 }
4827 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004828 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004829 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004830 base64bits += 16;
4831 base64buffer = (base64buffer << 16) | ch;
4832 while (base64bits >= 6) {
4833 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4834 base64bits -= 6;
4835 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004836 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004837 if (base64bits)
4838 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4839 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004841 if (_PyBytes_Resize(&v, out - start) < 0)
4842 return NULL;
4843 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004844}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004845PyObject *
4846PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4847 Py_ssize_t size,
4848 int base64SetO,
4849 int base64WhiteSpace,
4850 const char *errors)
4851{
4852 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004853 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004854 if (tmp == NULL)
4855 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004856 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004857 base64WhiteSpace, errors);
4858 Py_DECREF(tmp);
4859 return result;
4860}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004861
Antoine Pitrou244651a2009-05-04 18:56:13 +00004862#undef IS_BASE64
4863#undef FROM_BASE64
4864#undef TO_BASE64
4865#undef DECODE_DIRECT
4866#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004867
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868/* --- UTF-8 Codec -------------------------------------------------------- */
4869
Alexander Belopolsky40018472011-02-26 01:02:56 +00004870PyObject *
4871PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004872 Py_ssize_t size,
4873 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874{
Walter Dörwald69652032004-09-07 20:24:22 +00004875 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4876}
4877
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878#include "stringlib/asciilib.h"
4879#include "stringlib/codecs.h"
4880#include "stringlib/undef.h"
4881
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004882#include "stringlib/ucs1lib.h"
4883#include "stringlib/codecs.h"
4884#include "stringlib/undef.h"
4885
4886#include "stringlib/ucs2lib.h"
4887#include "stringlib/codecs.h"
4888#include "stringlib/undef.h"
4889
4890#include "stringlib/ucs4lib.h"
4891#include "stringlib/codecs.h"
4892#include "stringlib/undef.h"
4893
Antoine Pitrouab868312009-01-10 15:40:25 +00004894/* Mask to quickly check whether a C 'long' contains a
4895 non-ASCII, UTF8-encoded char. */
4896#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004897# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004898#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004899# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004900#else
4901# error C 'long' size should be either 4 or 8!
4902#endif
4903
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904static Py_ssize_t
4905ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004906{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004907 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004908 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004909
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004910 /*
4911 * Issue #17237: m68k is a bit different from most architectures in
4912 * that objects do not use "natural alignment" - for example, int and
4913 * long are only aligned at 2-byte boundaries. Therefore the assert()
4914 * won't work; also, tests have shown that skipping the "optimised
4915 * version" will even speed up m68k.
4916 */
4917#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004919 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4920 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 /* Fast path, see in STRINGLIB(utf8_decode) for
4922 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004923 /* Help allocation */
4924 const char *_p = p;
4925 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 while (_p < aligned_end) {
4927 unsigned long value = *(const unsigned long *) _p;
4928 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 *((unsigned long *)q) = value;
4931 _p += SIZEOF_LONG;
4932 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004933 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934 p = _p;
4935 while (p < end) {
4936 if ((unsigned char)*p & 0x80)
4937 break;
4938 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004942#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004943#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944 while (p < end) {
4945 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4946 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004947 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004948 /* Help allocation */
4949 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 while (_p < aligned_end) {
4951 unsigned long value = *(unsigned long *) _p;
4952 if (value & ASCII_CHAR_MASK)
4953 break;
4954 _p += SIZEOF_LONG;
4955 }
4956 p = _p;
4957 if (_p == end)
4958 break;
4959 }
4960 if ((unsigned char)*p & 0x80)
4961 break;
4962 ++p;
4963 }
4964 memcpy(dest, start, p - start);
4965 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966}
Antoine Pitrouab868312009-01-10 15:40:25 +00004967
Victor Stinner785938e2011-12-11 20:09:03 +01004968PyObject *
4969PyUnicode_DecodeUTF8Stateful(const char *s,
4970 Py_ssize_t size,
4971 const char *errors,
4972 Py_ssize_t *consumed)
4973{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004974 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004975 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004976 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977
4978 Py_ssize_t startinpos;
4979 Py_ssize_t endinpos;
4980 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004981 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004983 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004984
4985 if (size == 0) {
4986 if (consumed)
4987 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004988 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004989 }
4990
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4992 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004993 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 *consumed = 1;
4995 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004996 }
4997
Victor Stinner8f674cc2013-04-17 23:02:17 +02004998 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004999 writer.min_length = size;
5000 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005001 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01005002
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 writer.pos = ascii_decode(s, end, writer.data);
5004 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 while (s < end) {
5006 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005007 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005008
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005010 if (PyUnicode_IS_ASCII(writer.buffer))
5011 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005013 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005014 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005015 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005016 } else {
5017 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005018 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005019 }
5020
5021 switch (ch) {
5022 case 0:
5023 if (s == end || consumed)
5024 goto End;
5025 errmsg = "unexpected end of data";
5026 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005027 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 break;
5029 case 1:
5030 errmsg = "invalid start byte";
5031 startinpos = s - starts;
5032 endinpos = startinpos + 1;
5033 break;
5034 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005035 case 3:
5036 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005037 errmsg = "invalid continuation byte";
5038 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005039 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005040 break;
5041 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005042 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 goto onError;
5044 continue;
5045 }
5046
Victor Stinner1d65d912015-10-05 13:43:50 +02005047 if (error_handler == _Py_ERROR_UNKNOWN)
5048 error_handler = get_error_handler(errors);
5049
5050 switch (error_handler) {
5051 case _Py_ERROR_IGNORE:
5052 s += (endinpos - startinpos);
5053 break;
5054
5055 case _Py_ERROR_REPLACE:
5056 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5057 goto onError;
5058 s += (endinpos - startinpos);
5059 break;
5060
5061 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005062 {
5063 Py_ssize_t i;
5064
Victor Stinner1d65d912015-10-05 13:43:50 +02005065 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5066 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005067 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005068 ch = (Py_UCS4)(unsigned char)(starts[i]);
5069 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5070 ch + 0xdc00);
5071 writer.pos++;
5072 }
5073 s += (endinpos - startinpos);
5074 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005075 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005076
5077 default:
5078 if (unicode_decode_call_errorhandler_writer(
5079 errors, &error_handler_obj,
5080 "utf-8", errmsg,
5081 &starts, &end, &startinpos, &endinpos, &exc, &s,
5082 &writer))
5083 goto onError;
5084 }
Victor Stinner785938e2011-12-11 20:09:03 +01005085 }
5086
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005088 if (consumed)
5089 *consumed = s - starts;
5090
Victor Stinner1d65d912015-10-05 13:43:50 +02005091 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005093 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005094
5095onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005096 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005097 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005098 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005099 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005100}
5101
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X and Android.

   s points to size bytes of input.  Bytes that cannot be decoded are
   stored as 0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL on memory
   allocation error or if size is too large to allocate a buffer for. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 units (including the terminator) are
       enough.  The bound is written so that size + 1 is never evaluated
       before size is known to be small enough: computing it for
       size == PY_SSIZE_T_MAX would overflow a signed integer. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* not expected with a 4-byte output buffer */
            assert(0);
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* 0 with all input used up means the decoder finished */
            if (!ch && s == e)
                break;
            /* surrogateescape: escape one byte and resume after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005158/* Primary internal function which creates utf8 encoded bytes objects.
5159
5160 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005161 and allocate exactly as much space needed at the end. Else allocate the
5162 maximum possible needed (4 result bytes per Unicode character), and return
5163 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005164*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005165PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005166_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167{
Victor Stinner6099a032011-12-18 14:22:26 +01005168 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005169 void *data;
5170 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005172 if (!PyUnicode_Check(unicode)) {
5173 PyErr_BadArgument();
5174 return NULL;
5175 }
5176
5177 if (PyUnicode_READY(unicode) == -1)
5178 return NULL;
5179
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005180 if (PyUnicode_UTF8(unicode))
5181 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5182 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005183
5184 kind = PyUnicode_KIND(unicode);
5185 data = PyUnicode_DATA(unicode);
5186 size = PyUnicode_GET_LENGTH(unicode);
5187
Benjamin Petersonead6b532011-12-20 17:23:42 -06005188 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005189 default:
5190 assert(0);
5191 case PyUnicode_1BYTE_KIND:
5192 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5193 assert(!PyUnicode_IS_ASCII(unicode));
5194 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5195 case PyUnicode_2BYTE_KIND:
5196 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5197 case PyUnicode_4BYTE_KIND:
5198 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200}
5201
Alexander Belopolsky40018472011-02-26 01:02:56 +00005202PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005203PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5204 Py_ssize_t size,
5205 const char *errors)
5206{
5207 PyObject *v, *unicode;
5208
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005209 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005210 if (unicode == NULL)
5211 return NULL;
5212 v = _PyUnicode_AsUTF8String(unicode, errors);
5213 Py_DECREF(unicode);
5214 return v;
5215}
5216
5217PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005218PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005220 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221}
5222
Walter Dörwald41980ca2007-08-16 21:55:45 +00005223/* --- UTF-32 Codec ------------------------------------------------------- */
5224
5225PyObject *
5226PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 Py_ssize_t size,
5228 const char *errors,
5229 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230{
5231 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5232}
5233
5234PyObject *
5235PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 Py_ssize_t size,
5237 const char *errors,
5238 int *byteorder,
5239 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005240{
5241 const char *starts = s;
5242 Py_ssize_t startinpos;
5243 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005245 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005247 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005249 PyObject *errorHandler = NULL;
5250 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005251
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 q = (unsigned char *)s;
5253 e = q + size;
5254
5255 if (byteorder)
5256 bo = *byteorder;
5257
5258 /* Check for BOM marks (U+FEFF) in the input and adjust current
5259 byte order setting accordingly. In native mode, the leading BOM
5260 mark is skipped, in all other modes, it is copied to the output
5261 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005262 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005263 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005264 if (bom == 0x0000FEFF) {
5265 bo = -1;
5266 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005268 else if (bom == 0xFFFE0000) {
5269 bo = 1;
5270 q += 4;
5271 }
5272 if (byteorder)
5273 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274 }
5275
Victor Stinnere64322e2012-10-30 23:12:47 +01005276 if (q == e) {
5277 if (consumed)
5278 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005279 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005280 }
5281
Victor Stinnere64322e2012-10-30 23:12:47 +01005282#ifdef WORDS_BIGENDIAN
5283 le = bo < 0;
5284#else
5285 le = bo <= 0;
5286#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005287 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005288
Victor Stinner8f674cc2013-04-17 23:02:17 +02005289 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005290 writer.min_length = (e - q + 3) / 4;
5291 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005292 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005293
Victor Stinnere64322e2012-10-30 23:12:47 +01005294 while (1) {
5295 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005296 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005297
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 enum PyUnicode_Kind kind = writer.kind;
5300 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005303 if (le) {
5304 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005305 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005306 if (ch > maxch)
5307 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005308 if (kind != PyUnicode_1BYTE_KIND &&
5309 Py_UNICODE_IS_SURROGATE(ch))
5310 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005311 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005312 q += 4;
5313 } while (q <= last);
5314 }
5315 else {
5316 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005317 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005318 if (ch > maxch)
5319 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005320 if (kind != PyUnicode_1BYTE_KIND &&
5321 Py_UNICODE_IS_SURROGATE(ch))
5322 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005323 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 q += 4;
5325 } while (q <= last);
5326 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005327 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005328 }
5329
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005330 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005331 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005332 startinpos = ((const char *)q) - starts;
5333 endinpos = startinpos + 4;
5334 }
5335 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005338 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005340 startinpos = ((const char *)q) - starts;
5341 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 else {
5344 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005345 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005346 goto onError;
5347 q += 4;
5348 continue;
5349 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005350 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005351 startinpos = ((const char *)q) - starts;
5352 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005354
5355 /* The remaining input chars are ignored if the callback
5356 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005359 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005361 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363 }
5364
Walter Dörwald41980ca2007-08-16 21:55:45 +00005365 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368 Py_XDECREF(errorHandler);
5369 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005370 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374 Py_XDECREF(errorHandler);
5375 Py_XDECREF(exc);
5376 return NULL;
5377}
5378
5379PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005380_PyUnicode_EncodeUTF32(PyObject *str,
5381 const char *errors,
5382 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005383{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005384 enum PyUnicode_Kind kind;
5385 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005386 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005387 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005388 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005389#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005390 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005391#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005392 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005393#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005394 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005396 PyObject *errorHandler = NULL;
5397 PyObject *exc = NULL;
5398 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400 if (!PyUnicode_Check(str)) {
5401 PyErr_BadArgument();
5402 return NULL;
5403 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005404 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005405 return NULL;
5406 kind = PyUnicode_KIND(str);
5407 data = PyUnicode_DATA(str);
5408 len = PyUnicode_GET_LENGTH(str);
5409
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005410 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005411 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005412 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005413 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005414 if (v == NULL)
5415 return NULL;
5416
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 /* output buffer is 4-bytes aligned */
5418 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005419 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005420 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005422 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005423 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005424
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005425 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005426 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005427 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005428 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005429 else
5430 encoding = "utf-32";
5431
5432 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005433 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5434 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005435 }
5436
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005437 pos = 0;
5438 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005439 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440
5441 if (kind == PyUnicode_2BYTE_KIND) {
5442 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5443 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005445 else {
5446 assert(kind == PyUnicode_4BYTE_KIND);
5447 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5448 &out, native_ordering);
5449 }
5450 if (pos == len)
5451 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005452
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005453 rep = unicode_encode_call_errorhandler(
5454 errors, &errorHandler,
5455 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005456 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005457 if (!rep)
5458 goto error;
5459
5460 if (PyBytes_Check(rep)) {
5461 repsize = PyBytes_GET_SIZE(rep);
5462 if (repsize & 3) {
5463 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005464 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005465 "surrogates not allowed");
5466 goto error;
5467 }
5468 moreunits = repsize / 4;
5469 }
5470 else {
5471 assert(PyUnicode_Check(rep));
5472 if (PyUnicode_READY(rep) < 0)
5473 goto error;
5474 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5475 if (!PyUnicode_IS_ASCII(rep)) {
5476 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005477 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005478 "surrogates not allowed");
5479 goto error;
5480 }
5481 }
5482
5483 /* four bytes are reserved for each surrogate */
5484 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005485 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005486 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005487 /* integer overflow */
5488 PyErr_NoMemory();
5489 goto error;
5490 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005491 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005492 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005493 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 }
5495
5496 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005497 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005498 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005500 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5502 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 }
5504
5505 Py_CLEAR(rep);
5506 }
5507
5508 /* Cut back to size actually needed. This is necessary for, for example,
5509 encoding of a string containing isolated surrogates and the 'ignore'
5510 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005511 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005512 if (nsize != PyBytes_GET_SIZE(v))
5513 _PyBytes_Resize(&v, nsize);
5514 Py_XDECREF(errorHandler);
5515 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005516 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005517 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005518 error:
5519 Py_XDECREF(rep);
5520 Py_XDECREF(errorHandler);
5521 Py_XDECREF(exc);
5522 Py_XDECREF(v);
5523 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005524}
5525
Alexander Belopolsky40018472011-02-26 01:02:56 +00005526PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005527PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5528 Py_ssize_t size,
5529 const char *errors,
5530 int byteorder)
5531{
5532 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005533 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005534 if (tmp == NULL)
5535 return NULL;
5536 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5537 Py_DECREF(tmp);
5538 return result;
5539}
5540
5541PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005542PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005543{
Victor Stinnerb960b342011-11-20 19:12:52 +01005544 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005545}
5546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547/* --- UTF-16 Codec ------------------------------------------------------- */
5548
Tim Peters772747b2001-08-09 22:21:55 +00005549PyObject *
5550PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 Py_ssize_t size,
5552 const char *errors,
5553 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554{
Walter Dörwald69652032004-09-07 20:24:22 +00005555 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5556}
5557
5558PyObject *
5559PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 Py_ssize_t size,
5561 const char *errors,
5562 int *byteorder,
5563 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005564{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005566 Py_ssize_t startinpos;
5567 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005568 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005569 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005570 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005571 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005572 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005573 PyObject *errorHandler = NULL;
5574 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005575 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
Tim Peters772747b2001-08-09 22:21:55 +00005577 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005578 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
5580 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005581 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005583 /* Check for BOM marks (U+FEFF) in the input and adjust current
5584 byte order setting accordingly. In native mode, the leading BOM
5585 mark is skipped, in all other modes, it is copied to the output
5586 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005587 if (bo == 0 && size >= 2) {
5588 const Py_UCS4 bom = (q[1] << 8) | q[0];
5589 if (bom == 0xFEFF) {
5590 q += 2;
5591 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 else if (bom == 0xFFFE) {
5594 q += 2;
5595 bo = 1;
5596 }
5597 if (byteorder)
5598 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600
Antoine Pitrou63065d72012-05-15 23:48:04 +02005601 if (q == e) {
5602 if (consumed)
5603 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005604 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005605 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005606
Christian Heimes743e0cd2012-10-17 23:52:17 +02005607#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005609 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005610#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005613#endif
Tim Peters772747b2001-08-09 22:21:55 +00005614
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 /* Note: size will always be longer than the resulting Unicode
5616 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005617 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005618 writer.min_length = (e - q + 1) / 2;
5619 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005620 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005621
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 while (1) {
5623 Py_UCS4 ch = 0;
5624 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005625 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 native_ordering);
5631 else
5632 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 native_ordering);
5635 } else if (kind == PyUnicode_2BYTE_KIND) {
5636 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638 native_ordering);
5639 } else {
5640 assert(kind == PyUnicode_4BYTE_KIND);
5641 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005642 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005644 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005645 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646
Antoine Pitrou63065d72012-05-15 23:48:04 +02005647 switch (ch)
5648 {
5649 case 0:
5650 /* remaining byte at the end? (size should be even) */
5651 if (q == e || consumed)
5652 goto End;
5653 errmsg = "truncated data";
5654 startinpos = ((const char *)q) - starts;
5655 endinpos = ((const char *)e) - starts;
5656 break;
5657 /* The remaining input chars are ignored if the callback
5658 chooses to skip the input */
5659 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005660 q -= 2;
5661 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005662 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005663 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005664 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005665 endinpos = ((const char *)e) - starts;
5666 break;
5667 case 2:
5668 errmsg = "illegal encoding";
5669 startinpos = ((const char *)q) - 2 - starts;
5670 endinpos = startinpos + 2;
5671 break;
5672 case 3:
5673 errmsg = "illegal UTF-16 surrogate";
5674 startinpos = ((const char *)q) - 4 - starts;
5675 endinpos = startinpos + 2;
5676 break;
5677 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005678 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005679 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 continue;
5681 }
5682
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005683 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005684 errors,
5685 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005686 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005687 &starts,
5688 (const char **)&e,
5689 &startinpos,
5690 &endinpos,
5691 &exc,
5692 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005693 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 }
5696
Antoine Pitrou63065d72012-05-15 23:48:04 +02005697End:
Walter Dörwald69652032004-09-07 20:24:22 +00005698 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 Py_XDECREF(errorHandler);
5702 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005703 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005706 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 Py_XDECREF(errorHandler);
5708 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 return NULL;
5710}
5711
Tim Peters772747b2001-08-09 22:21:55 +00005712PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005713_PyUnicode_EncodeUTF16(PyObject *str,
5714 const char *errors,
5715 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005717 enum PyUnicode_Kind kind;
5718 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005720 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005721 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005722 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005723#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005724 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005725#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005726 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005727#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005728 const char *encoding;
5729 Py_ssize_t nsize, pos;
5730 PyObject *errorHandler = NULL;
5731 PyObject *exc = NULL;
5732 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005733
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005734 if (!PyUnicode_Check(str)) {
5735 PyErr_BadArgument();
5736 return NULL;
5737 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005738 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005739 return NULL;
5740 kind = PyUnicode_KIND(str);
5741 data = PyUnicode_DATA(str);
5742 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005743
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005744 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005745 if (kind == PyUnicode_4BYTE_KIND) {
5746 const Py_UCS4 *in = (const Py_UCS4 *)data;
5747 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005748 while (in < end) {
5749 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005750 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 }
5752 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005753 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005754 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005757 nsize = len + pairs + (byteorder == 0);
5758 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005759 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005761 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005763 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005764 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005765 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005766 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005767 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005768 }
5769 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005770 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005771 }
Tim Peters772747b2001-08-09 22:21:55 +00005772
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005773 if (kind == PyUnicode_1BYTE_KIND) {
5774 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5775 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005776 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005777
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005778 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005780 }
5781 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005782 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005783 }
5784 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005785 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005786 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005787
5788 pos = 0;
5789 while (pos < len) {
5790 Py_ssize_t repsize, moreunits;
5791
5792 if (kind == PyUnicode_2BYTE_KIND) {
5793 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5794 &out, native_ordering);
5795 }
5796 else {
5797 assert(kind == PyUnicode_4BYTE_KIND);
5798 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5799 &out, native_ordering);
5800 }
5801 if (pos == len)
5802 break;
5803
5804 rep = unicode_encode_call_errorhandler(
5805 errors, &errorHandler,
5806 encoding, "surrogates not allowed",
5807 str, &exc, pos, pos + 1, &pos);
5808 if (!rep)
5809 goto error;
5810
5811 if (PyBytes_Check(rep)) {
5812 repsize = PyBytes_GET_SIZE(rep);
5813 if (repsize & 1) {
5814 raise_encode_exception(&exc, encoding,
5815 str, pos - 1, pos,
5816 "surrogates not allowed");
5817 goto error;
5818 }
5819 moreunits = repsize / 2;
5820 }
5821 else {
5822 assert(PyUnicode_Check(rep));
5823 if (PyUnicode_READY(rep) < 0)
5824 goto error;
5825 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5826 if (!PyUnicode_IS_ASCII(rep)) {
5827 raise_encode_exception(&exc, encoding,
5828 str, pos - 1, pos,
5829 "surrogates not allowed");
5830 goto error;
5831 }
5832 }
5833
5834 /* two bytes are reserved for each surrogate */
5835 if (moreunits > 1) {
5836 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005837 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005838 /* integer overflow */
5839 PyErr_NoMemory();
5840 goto error;
5841 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005842 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005843 goto error;
5844 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5845 }
5846
5847 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005848 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005849 out += moreunits;
5850 } else /* rep is unicode */ {
5851 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5852 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5853 &out, native_ordering);
5854 }
5855
5856 Py_CLEAR(rep);
5857 }
5858
5859 /* Cut back to size actually needed. This is necessary for, for example,
5860 encoding of a string containing isolated surrogates and the 'ignore' handler
5861 is used. */
5862 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5863 if (nsize != PyBytes_GET_SIZE(v))
5864 _PyBytes_Resize(&v, nsize);
5865 Py_XDECREF(errorHandler);
5866 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005867 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005868 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005869 error:
5870 Py_XDECREF(rep);
5871 Py_XDECREF(errorHandler);
5872 Py_XDECREF(exc);
5873 Py_XDECREF(v);
5874 return NULL;
5875#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876}
5877
Alexander Belopolsky40018472011-02-26 01:02:56 +00005878PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5880 Py_ssize_t size,
5881 const char *errors,
5882 int byteorder)
5883{
5884 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005885 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005886 if (tmp == NULL)
5887 return NULL;
5888 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5889 Py_DECREF(tmp);
5890 return result;
5891}
5892
5893PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005894PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005896 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897}
5898
5899/* --- Unicode Escape Codec ----------------------------------------------- */
5900
Fredrik Lundh06d12682001-01-24 07:59:11 +00005901static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005902
Alexander Belopolsky40018472011-02-26 01:02:56 +00005903PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005904_PyUnicode_DecodeUnicodeEscape(const char *s,
5905 Py_ssize_t size,
5906 const char *errors,
5907 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005910 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005912 PyObject *errorHandler = NULL;
5913 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005914
Eric V. Smith42454af2016-10-31 09:22:08 -04005915 // so we can remember if we've seen an invalid escape char or not
5916 *first_invalid_escape = NULL;
5917
Victor Stinner62ec3312016-09-06 17:04:34 -07005918 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005919 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005920 }
5921 /* Escaped strings will always be longer than the resulting
5922 Unicode string, so we start with size here and then reduce the
5923 length after conversion to the true value.
5924 (but if the error callback returns a long replacement string
5925 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005926 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005927 writer.min_length = size;
5928 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5929 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005930 }
5931
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 end = s + size;
5933 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005934 unsigned char c = (unsigned char) *s++;
5935 Py_UCS4 ch;
5936 int count;
5937 Py_ssize_t startinpos;
5938 Py_ssize_t endinpos;
5939 const char *message;
5940
5941#define WRITE_ASCII_CHAR(ch) \
5942 do { \
5943 assert(ch <= 127); \
5944 assert(writer.pos < writer.size); \
5945 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5946 } while(0)
5947
5948#define WRITE_CHAR(ch) \
5949 do { \
5950 if (ch <= writer.maxchar) { \
5951 assert(writer.pos < writer.size); \
5952 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5953 } \
5954 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5955 goto onError; \
5956 } \
5957 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958
5959 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005960 if (c != '\\') {
5961 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 continue;
5963 }
5964
Victor Stinner62ec3312016-09-06 17:04:34 -07005965 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005967 if (s >= end) {
5968 message = "\\ at end of string";
5969 goto error;
5970 }
5971 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005972
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005974 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005977 case '\n': continue;
5978 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5979 case '\'': WRITE_ASCII_CHAR('\''); continue;
5980 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5981 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005982 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005983 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5984 case 't': WRITE_ASCII_CHAR('\t'); continue;
5985 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5986 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005987 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005988 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005989 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005990 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 case '0': case '1': case '2': case '3':
5994 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005995 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005996 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005997 ch = (ch<<3) + *s++ - '0';
5998 if (s < end && '0' <= *s && *s <= '7') {
5999 ch = (ch<<3) + *s++ - '0';
6000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006002 WRITE_CHAR(ch);
6003 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 /* hex escapes */
6006 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006008 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006009 message = "truncated \\xXX escape";
6010 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006014 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006015 message = "truncated \\uXXXX escape";
6016 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006019 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006020 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006021 message = "truncated \\UXXXXXXXX escape";
6022 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006023 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006024 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006025 ch <<= 4;
6026 if (c >= '0' && c <= '9') {
6027 ch += c - '0';
6028 }
6029 else if (c >= 'a' && c <= 'f') {
6030 ch += c - ('a' - 10);
6031 }
6032 else if (c >= 'A' && c <= 'F') {
6033 ch += c - ('A' - 10);
6034 }
6035 else {
6036 break;
6037 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006038 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006039 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006040 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006041 }
6042
6043 /* when we get here, ch is a 32-bit unicode character */
6044 if (ch > MAX_UNICODE) {
6045 message = "illegal Unicode character";
6046 goto error;
6047 }
6048
6049 WRITE_CHAR(ch);
6050 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006051
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006053 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 if (ucnhash_CAPI == NULL) {
6055 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006056 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6057 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006058 if (ucnhash_CAPI == NULL) {
6059 PyErr_SetString(
6060 PyExc_UnicodeError,
6061 "\\N escapes not supported (can't load unicodedata module)"
6062 );
6063 goto onError;
6064 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006065 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006066
6067 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006068 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006069 const char *start = ++s;
6070 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006072 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006074 namelen = s - start;
6075 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006077 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006078 ch = 0xffffffff; /* in case 'getcode' messes up */
6079 if (namelen <= INT_MAX &&
6080 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6081 &ch, 0)) {
6082 assert(ch <= MAX_UNICODE);
6083 WRITE_CHAR(ch);
6084 continue;
6085 }
6086 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006087 }
6088 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006089 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006090
6091 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006092 if (*first_invalid_escape == NULL) {
6093 *first_invalid_escape = s-1; /* Back up one char, since we've
6094 already incremented s. */
6095 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006096 WRITE_ASCII_CHAR('\\');
6097 WRITE_CHAR(c);
6098 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006100
6101 error:
6102 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006103 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006104 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006105 errors, &errorHandler,
6106 "unicodeescape", message,
6107 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006108 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006109 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006110 }
6111 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6112 goto onError;
6113 }
6114
6115#undef WRITE_ASCII_CHAR
6116#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006118
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006119 Py_XDECREF(errorHandler);
6120 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006121 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006122
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006124 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006125 Py_XDECREF(errorHandler);
6126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 return NULL;
6128}
6129
Eric V. Smith42454af2016-10-31 09:22:08 -04006130PyObject *
6131PyUnicode_DecodeUnicodeEscape(const char *s,
6132 Py_ssize_t size,
6133 const char *errors)
6134{
6135 const char *first_invalid_escape;
6136 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6137 &first_invalid_escape);
6138 if (result == NULL)
6139 return NULL;
6140 if (first_invalid_escape != NULL) {
6141 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6142 "invalid escape sequence '\\%c'",
6143 *first_invalid_escape) < 0) {
6144 Py_DECREF(result);
6145 return NULL;
6146 }
6147 }
6148 return result;
6149}
6150
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006151/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006159 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162
Ezio Melottie7f90372012-10-05 03:33:31 +03006163 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006164 escape.
6165
Ezio Melottie7f90372012-10-05 03:33:31 +03006166 For UCS1 strings it's '\xxx', 4 bytes per source character.
6167 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6168 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006169 */
6170
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 if (!PyUnicode_Check(unicode)) {
6172 PyErr_BadArgument();
6173 return NULL;
6174 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006175 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006176 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 }
Victor Stinner358af132015-10-12 22:36:57 +02006178
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006180 if (len == 0) {
6181 return PyBytes_FromStringAndSize(NULL, 0);
6182 }
6183
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 kind = PyUnicode_KIND(unicode);
6185 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006186 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6187 bytes, and 1 byte characters 4. */
6188 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006189 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006190 return PyErr_NoMemory();
6191 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006192 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 if (repr == NULL) {
6194 return NULL;
6195 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006199 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006200
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 /* U+0000-U+00ff range */
6202 if (ch < 0x100) {
6203 if (ch >= ' ' && ch < 127) {
6204 if (ch != '\\') {
6205 /* Copy printable US ASCII as-is */
6206 *p++ = (char) ch;
6207 }
6208 /* Escape backslashes */
6209 else {
6210 *p++ = '\\';
6211 *p++ = '\\';
6212 }
6213 }
Victor Stinner358af132015-10-12 22:36:57 +02006214
Victor Stinner62ec3312016-09-06 17:04:34 -07006215 /* Map special whitespace to '\t', \n', '\r' */
6216 else if (ch == '\t') {
6217 *p++ = '\\';
6218 *p++ = 't';
6219 }
6220 else if (ch == '\n') {
6221 *p++ = '\\';
6222 *p++ = 'n';
6223 }
6224 else if (ch == '\r') {
6225 *p++ = '\\';
6226 *p++ = 'r';
6227 }
6228
6229 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6230 else {
6231 *p++ = '\\';
6232 *p++ = 'x';
6233 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6234 *p++ = Py_hexdigits[ch & 0x000F];
6235 }
Tim Petersced69f82003-09-16 20:30:58 +00006236 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006237 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006238 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 *p++ = '\\';
6240 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006241 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6242 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6243 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6244 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006246 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6247 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006248
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 /* Make sure that the first two digits are zero */
6250 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006251 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 *p++ = 'U';
6253 *p++ = '0';
6254 *p++ = '0';
6255 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6256 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6257 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6258 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6259 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6260 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 assert(p - PyBytes_AS_STRING(repr) > 0);
6265 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6266 return NULL;
6267 }
6268 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269}
6270
Alexander Belopolsky40018472011-02-26 01:02:56 +00006271PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006272PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6273 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006275 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006276 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006277 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006279 }
6280
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006281 result = PyUnicode_AsUnicodeEscapeString(tmp);
6282 Py_DECREF(tmp);
6283 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284}
6285
6286/* --- Raw Unicode Escape Codec ------------------------------------------- */
6287
Alexander Belopolsky40018472011-02-26 01:02:56 +00006288PyObject *
6289PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006290 Py_ssize_t size,
6291 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006294 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 PyObject *errorHandler = NULL;
6297 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006298
Victor Stinner62ec3312016-09-06 17:04:34 -07006299 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006300 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006302
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 /* Escaped strings will always be longer than the resulting
6304 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006305 length after conversion to the true value. (But decoding error
6306 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006307 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006308 writer.min_length = size;
6309 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6310 goto onError;
6311 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006312
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 end = s + size;
6314 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 unsigned char c = (unsigned char) *s++;
6316 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006317 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006318 Py_ssize_t startinpos;
6319 Py_ssize_t endinpos;
6320 const char *message;
6321
6322#define WRITE_CHAR(ch) \
6323 do { \
6324 if (ch <= writer.maxchar) { \
6325 assert(writer.pos < writer.size); \
6326 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6327 } \
6328 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6329 goto onError; \
6330 } \
6331 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 if (c != '\\' || s >= end) {
6335 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006337 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006338
Victor Stinner62ec3312016-09-06 17:04:34 -07006339 c = (unsigned char) *s++;
6340 if (c == 'u') {
6341 count = 4;
6342 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006344 else if (c == 'U') {
6345 count = 8;
6346 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006347 }
6348 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006349 assert(writer.pos < writer.size);
6350 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6351 WRITE_CHAR(c);
6352 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006353 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006354 startinpos = s - starts - 2;
6355
6356 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6357 for (ch = 0; count && s < end; ++s, --count) {
6358 c = (unsigned char)*s;
6359 ch <<= 4;
6360 if (c >= '0' && c <= '9') {
6361 ch += c - '0';
6362 }
6363 else if (c >= 'a' && c <= 'f') {
6364 ch += c - ('a' - 10);
6365 }
6366 else if (c >= 'A' && c <= 'F') {
6367 ch += c - ('A' - 10);
6368 }
6369 else {
6370 break;
6371 }
6372 }
6373 if (!count) {
6374 if (ch <= MAX_UNICODE) {
6375 WRITE_CHAR(ch);
6376 continue;
6377 }
6378 message = "\\Uxxxxxxxx out of range";
6379 }
6380
6381 endinpos = s-starts;
6382 writer.min_length = end - s + writer.pos;
6383 if (unicode_decode_call_errorhandler_writer(
6384 errors, &errorHandler,
6385 "rawunicodeescape", message,
6386 &starts, &end, &startinpos, &endinpos, &exc, &s,
6387 &writer)) {
6388 goto onError;
6389 }
6390 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6391 goto onError;
6392 }
6393
6394#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 Py_XDECREF(errorHandler);
6397 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006398 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006399
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006401 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 Py_XDECREF(errorHandler);
6403 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406}
6407
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006408
Alexander Belopolsky40018472011-02-26 01:02:56 +00006409PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 int kind;
6416 void *data;
6417 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006419 if (!PyUnicode_Check(unicode)) {
6420 PyErr_BadArgument();
6421 return NULL;
6422 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426 kind = PyUnicode_KIND(unicode);
6427 data = PyUnicode_DATA(unicode);
6428 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 if (kind == PyUnicode_1BYTE_KIND) {
6430 return PyBytes_FromStringAndSize(data, len);
6431 }
Victor Stinner0e368262011-11-10 20:12:49 +01006432
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6434 bytes, and 1 byte characters 4. */
6435 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006436
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 if (len > PY_SSIZE_T_MAX / expandsize) {
6438 return PyErr_NoMemory();
6439 }
6440 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6441 if (repr == NULL) {
6442 return NULL;
6443 }
6444 if (len == 0) {
6445 return repr;
6446 }
6447
6448 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006449 for (pos = 0; pos < len; pos++) {
6450 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006451
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6453 if (ch < 0x100) {
6454 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006455 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006456 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6457 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 *p++ = '\\';
6459 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006460 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6466 else {
6467 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6468 *p++ = '\\';
6469 *p++ = 'U';
6470 *p++ = '0';
6471 *p++ = '0';
6472 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6476 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6477 *p++ = Py_hexdigits[ch & 15];
6478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006480
Victor Stinner62ec3312016-09-06 17:04:34 -07006481 assert(p > PyBytes_AS_STRING(repr));
6482 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6483 return NULL;
6484 }
6485 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486}
6487
Alexander Belopolsky40018472011-02-26 01:02:56 +00006488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6490 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006493 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006494 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006495 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6497 Py_DECREF(tmp);
6498 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499}
6500
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006501/* --- Unicode Internal Codec ------------------------------------------- */
6502
Alexander Belopolsky40018472011-02-26 01:02:56 +00006503PyObject *
6504_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006505 Py_ssize_t size,
6506 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006507{
6508 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006509 Py_ssize_t startinpos;
6510 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006511 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006512 const char *end;
6513 const char *reason;
6514 PyObject *errorHandler = NULL;
6515 PyObject *exc = NULL;
6516
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006517 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006518 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006519 1))
6520 return NULL;
6521
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006522 if (size < 0) {
6523 PyErr_BadInternalCall();
6524 return NULL;
6525 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006526 if (size == 0)
6527 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006528
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 _PyUnicodeWriter_Init(&writer);
6530 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6531 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 }
6534 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006535
Victor Stinner8f674cc2013-04-17 23:02:17 +02006536 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006538 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006539 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006540 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006541 endinpos = end-starts;
6542 reason = "truncated input";
6543 goto error;
6544 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006545 /* We copy the raw representation one byte at a time because the
6546 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[0] = s[0];
6548 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ((char *) &uch)[2] = s[2];
6551 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006552#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006553 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006554#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555 /* We have to sanity check the raw data, otherwise doom looms for
6556 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006557 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006558 endinpos = s - starts + Py_UNICODE_SIZE;
6559 reason = "illegal code point (> 0x10FFFF)";
6560 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006561 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006562#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006563 s += Py_UNICODE_SIZE;
6564#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006565 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006566 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006567 Py_UNICODE uch2;
6568 ((char *) &uch2)[0] = s[0];
6569 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006570 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006571 {
Victor Stinner551ac952011-11-29 22:58:13 +01006572 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006573 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006574 }
6575 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006576#endif
6577
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006578 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006579 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006580 continue;
6581
6582 error:
6583 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006584 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006585 errors, &errorHandler,
6586 "unicode_internal", reason,
6587 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006588 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006589 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006590 }
6591
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592 Py_XDECREF(errorHandler);
6593 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006594 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006595
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006597 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006598 Py_XDECREF(errorHandler);
6599 Py_XDECREF(exc);
6600 return NULL;
6601}
6602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603/* --- Latin-1 Codec ------------------------------------------------------ */
6604
Alexander Belopolsky40018472011-02-26 01:02:56 +00006605PyObject *
6606PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006607 Py_ssize_t size,
6608 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006611 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615static void
6616make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006617 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006618 PyObject *unicode,
6619 Py_ssize_t startpos, Py_ssize_t endpos,
6620 const char *reason)
6621{
6622 if (*exceptionObject == NULL) {
6623 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006625 encoding, unicode, startpos, endpos, reason);
6626 }
6627 else {
6628 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6629 goto onError;
6630 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6631 goto onError;
6632 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6633 goto onError;
6634 return;
6635 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006636 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006637 }
6638}
6639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006641static void
6642raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006643 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006644 PyObject *unicode,
6645 Py_ssize_t startpos, Py_ssize_t endpos,
6646 const char *reason)
6647{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006648 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006649 encoding, unicode, startpos, endpos, reason);
6650 if (*exceptionObject != NULL)
6651 PyCodec_StrictErrors(*exceptionObject);
6652}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653
6654/* error handling callback helper:
6655 build arguments, call the callback and check the arguments,
6656 put the result into newpos and return the replacement string, which
6657 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658static PyObject *
6659unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006660 PyObject **errorHandler,
6661 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006663 Py_ssize_t startpos, Py_ssize_t endpos,
6664 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006666 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006667 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 PyObject *restuple;
6669 PyObject *resunicode;
6670
6671 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006675 }
6676
Benjamin Petersonbac79492012-01-14 13:34:47 -05006677 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 return NULL;
6679 len = PyUnicode_GET_LENGTH(unicode);
6680
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006681 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006686 restuple = PyObject_CallFunctionObjArgs(
6687 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006691 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 Py_DECREF(restuple);
6693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006695 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 &resunicode, newpos)) {
6697 Py_DECREF(restuple);
6698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006700 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6701 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6702 Py_DECREF(restuple);
6703 return NULL;
6704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006706 *newpos = len + *newpos;
6707 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006708 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 Py_DECREF(restuple);
6710 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712 Py_INCREF(resunicode);
6713 Py_DECREF(restuple);
6714 return resunicode;
6715}
6716
Alexander Belopolsky40018472011-02-26 01:02:56 +00006717static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006719 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006720 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722 /* input state */
6723 Py_ssize_t pos=0, size;
6724 int kind;
6725 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006726 /* pointer into the output */
6727 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006728 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6729 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006730 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006732 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006733 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006734 /* output object */
6735 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736
Benjamin Petersonbac79492012-01-14 13:34:47 -05006737 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 return NULL;
6739 size = PyUnicode_GET_LENGTH(unicode);
6740 kind = PyUnicode_KIND(unicode);
6741 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 /* allocate enough for a simple encoding without
6743 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006744 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006745 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006746
6747 _PyBytesWriter_Init(&writer);
6748 str = _PyBytesWriter_Alloc(&writer, size);
6749 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006750 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006753 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006756 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006758 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006760 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006762 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006765 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006767
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006768 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006770
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006771 /* Only overallocate the buffer if it's not the last write */
6772 writer.overallocate = (collend < size);
6773
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006775 if (error_handler == _Py_ERROR_UNKNOWN)
6776 error_handler = get_error_handler(errors);
6777
6778 switch (error_handler) {
6779 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006780 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006782
6783 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006784 memset(str, '?', collend - collstart);
6785 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006786 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006787 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 break;
Victor Stinner50149202015-09-22 00:26:54 +02006790
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006791 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006792 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006793 writer.min_size -= (collend - collstart);
6794 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006796 if (str == NULL)
6797 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006798 pos = collend;
6799 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006800
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006801 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006802 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006803 writer.min_size -= (collend - collstart);
6804 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006805 unicode, collstart, collend);
6806 if (str == NULL)
6807 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006808 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 break;
Victor Stinner50149202015-09-22 00:26:54 +02006810
Victor Stinnerc3713e92015-09-29 12:32:13 +02006811 case _Py_ERROR_SURROGATEESCAPE:
6812 for (i = collstart; i < collend; ++i) {
6813 ch = PyUnicode_READ(kind, data, i);
6814 if (ch < 0xdc80 || 0xdcff < ch) {
6815 /* Not a UTF-8b surrogate */
6816 break;
6817 }
6818 *str++ = (char)(ch - 0xdc00);
6819 ++pos;
6820 }
6821 if (i >= collend)
6822 break;
6823 collstart = pos;
6824 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006825 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006826
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006828 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6829 encoding, reason, unicode, &exc,
6830 collstart, collend, &newpos);
6831 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006833
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006834 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006835 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006836
Victor Stinner6bd525b2015-10-09 13:10:05 +02006837 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006838 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006839 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006840 PyBytes_AS_STRING(rep),
6841 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006842 if (str == NULL)
6843 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006844 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845 else {
6846 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006847
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006850
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006851 if (limit == 256 ?
6852 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6853 !PyUnicode_IS_ASCII(rep))
6854 {
6855 /* Not all characters are smaller than limit */
6856 raise_encode_exception(&exc, encoding, unicode,
6857 collstart, collend, reason);
6858 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006860 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6861 str = _PyBytesWriter_WriteBytes(&writer, str,
6862 PyUnicode_DATA(rep),
6863 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006865 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006866 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006867 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006868
6869 /* If overallocation was disabled, ensure that it was the last
6870 write. Otherwise, we missed an optimization */
6871 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006872 }
6873 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006874
Victor Stinner50149202015-09-22 00:26:54 +02006875 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006877 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006878
6879 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006880 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006881 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006882 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006883 Py_XDECREF(exc);
6884 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006885}
6886
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006888PyObject *
6889PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006890 Py_ssize_t size,
6891 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006893 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006894 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006895 if (unicode == NULL)
6896 return NULL;
6897 result = unicode_encode_ucs1(unicode, errors, 256);
6898 Py_DECREF(unicode);
6899 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900}
6901
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006903_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904{
6905 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 PyErr_BadArgument();
6907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006909 if (PyUnicode_READY(unicode) == -1)
6910 return NULL;
6911 /* Fast path: if it is a one-byte string, construct
6912 bytes object directly. */
6913 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6914 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6915 PyUnicode_GET_LENGTH(unicode));
6916 /* Non-Latin-1 characters present. Defer to above function to
6917 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006918 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006919}
6920
6921PyObject*
6922PyUnicode_AsLatin1String(PyObject *unicode)
6923{
6924 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925}
6926
6927/* --- 7-bit ASCII Codec -------------------------------------------------- */
6928
Alexander Belopolsky40018472011-02-26 01:02:56 +00006929PyObject *
6930PyUnicode_DecodeASCII(const char *s,
6931 Py_ssize_t size,
6932 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006935 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006936 int kind;
6937 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006938 Py_ssize_t startinpos;
6939 Py_ssize_t endinpos;
6940 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006942 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006944 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006945
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006947 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006948
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006950 if (size == 1 && (unsigned char)s[0] < 128)
6951 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006952
Victor Stinner8f674cc2013-04-17 23:02:17 +02006953 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006954 writer.min_length = size;
6955 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006956 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006958 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006959 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006960 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006961 writer.pos = outpos;
6962 if (writer.pos == size)
6963 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006964
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006965 s += writer.pos;
6966 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006967 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006968 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006970 PyUnicode_WRITE(kind, data, writer.pos, c);
6971 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006973 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006975
6976 /* byte outsize range 0x00..0x7f: call the error handler */
6977
6978 if (error_handler == _Py_ERROR_UNKNOWN)
6979 error_handler = get_error_handler(errors);
6980
6981 switch (error_handler)
6982 {
6983 case _Py_ERROR_REPLACE:
6984 case _Py_ERROR_SURROGATEESCAPE:
6985 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006986 but we may switch to UCS2 at the first write */
6987 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6988 goto onError;
6989 kind = writer.kind;
6990 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006991
6992 if (error_handler == _Py_ERROR_REPLACE)
6993 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6994 else
6995 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6996 writer.pos++;
6997 ++s;
6998 break;
6999
7000 case _Py_ERROR_IGNORE:
7001 ++s;
7002 break;
7003
7004 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 startinpos = s-starts;
7006 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007008 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 "ascii", "ordinal not in range(128)",
7010 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007011 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 kind = writer.kind;
7014 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007017 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007018 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007019 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007022 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007023 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007024 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 return NULL;
7026}
7027
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007028/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007029PyObject *
7030PyUnicode_EncodeASCII(const Py_UNICODE *p,
7031 Py_ssize_t size,
7032 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007034 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007035 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007036 if (unicode == NULL)
7037 return NULL;
7038 result = unicode_encode_ucs1(unicode, errors, 128);
7039 Py_DECREF(unicode);
7040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041}
7042
Alexander Belopolsky40018472011-02-26 01:02:56 +00007043PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007044_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045{
7046 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyErr_BadArgument();
7048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007050 if (PyUnicode_READY(unicode) == -1)
7051 return NULL;
7052 /* Fast path: if it is an ASCII-only string, construct bytes object
7053 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007054 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7056 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007057 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007058}
7059
7060PyObject *
7061PyUnicode_AsASCIIString(PyObject *unicode)
7062{
7063 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Steve Dowercc16be82016-09-08 10:35:16 -07007066#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007067
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007068/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007069
/* When size_t is wider than int, decode_code_page_stateful() has to feed
   its input to the int-sized helpers in chunks of at most INT_MAX bytes. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide the flag value when the platform headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7077
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007078static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007079code_page_name(UINT code_page, PyObject **obj)
7080{
7081 *obj = NULL;
7082 if (code_page == CP_ACP)
7083 return "mbcs";
7084 if (code_page == CP_UTF7)
7085 return "CP_UTF7";
7086 if (code_page == CP_UTF8)
7087 return "CP_UTF8";
7088
7089 *obj = PyBytes_FromFormat("cp%u", code_page);
7090 if (*obj == NULL)
7091 return NULL;
7092 return PyBytes_AS_STRING(*obj);
7093}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094
Victor Stinner3a50e702011-10-18 21:21:00 +02007095static DWORD
7096decode_code_page_flags(UINT code_page)
7097{
7098 if (code_page == CP_UTF7) {
7099 /* The CP_UTF7 decoder only supports flags=0 */
7100 return 0;
7101 }
7102 else
7103 return MB_ERR_INVALID_CHARS;
7104}
7105
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007106/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 * Decode a byte string from a Windows code page into unicode object in strict
7108 * mode.
7109 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007110 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7111 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007114decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007115 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 const char *in,
7117 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118{
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007120 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122
7123 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 assert(insize > 0);
7125 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7126 if (outsize <= 0)
7127 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128
7129 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007131 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007132 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 if (*v == NULL)
7134 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136 }
7137 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007140 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 }
7144
7145 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7147 if (outsize <= 0)
7148 goto error;
7149 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007150
Victor Stinner3a50e702011-10-18 21:21:00 +02007151error:
7152 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7153 return -2;
7154 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007155 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156}
7157
Victor Stinner3a50e702011-10-18 21:21:00 +02007158/*
7159 * Decode a byte string from a code page into unicode object with an error
7160 * handler.
7161 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007162 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 * UnicodeDecodeError exception and returns -1 on error.
7164 */
7165static int
7166decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007167 PyObject **v,
7168 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007169 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007170{
7171 const char *startin = in;
7172 const char *endin = in + size;
7173 const DWORD flags = decode_code_page_flags(code_page);
7174 /* Ideally, we should get reason from FormatMessage. This is the Windows
7175 2000 English version of the message. */
7176 const char *reason = "No mapping for the Unicode character exists "
7177 "in the target code page.";
7178 /* each step cannot decode more than 1 character, but a character can be
7179 represented as a surrogate pair */
7180 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007181 int insize;
7182 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 PyObject *errorHandler = NULL;
7184 PyObject *exc = NULL;
7185 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007186 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 DWORD err;
7188 int ret = -1;
7189
7190 assert(size > 0);
7191
7192 encoding = code_page_name(code_page, &encoding_obj);
7193 if (encoding == NULL)
7194 return -1;
7195
Victor Stinner7d00cc12014-03-17 23:08:06 +01007196 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7198 UnicodeDecodeError. */
7199 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7200 if (exc != NULL) {
7201 PyCodec_StrictErrors(exc);
7202 Py_CLEAR(exc);
7203 }
7204 goto error;
7205 }
7206
7207 if (*v == NULL) {
7208 /* Create unicode object */
7209 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7210 PyErr_NoMemory();
7211 goto error;
7212 }
Victor Stinnerab595942011-12-17 04:59:06 +01007213 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007214 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 if (*v == NULL)
7216 goto error;
7217 startout = PyUnicode_AS_UNICODE(*v);
7218 }
7219 else {
7220 /* Extend unicode object */
7221 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7222 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7223 PyErr_NoMemory();
7224 goto error;
7225 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007226 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 goto error;
7228 startout = PyUnicode_AS_UNICODE(*v) + n;
7229 }
7230
7231 /* Decode the byte string character per character */
7232 out = startout;
7233 while (in < endin)
7234 {
7235 /* Decode a character */
7236 insize = 1;
7237 do
7238 {
7239 outsize = MultiByteToWideChar(code_page, flags,
7240 in, insize,
7241 buffer, Py_ARRAY_LENGTH(buffer));
7242 if (outsize > 0)
7243 break;
7244 err = GetLastError();
7245 if (err != ERROR_NO_UNICODE_TRANSLATION
7246 && err != ERROR_INSUFFICIENT_BUFFER)
7247 {
7248 PyErr_SetFromWindowsErr(0);
7249 goto error;
7250 }
7251 insize++;
7252 }
7253 /* 4=maximum length of a UTF-8 sequence */
7254 while (insize <= 4 && (in + insize) <= endin);
7255
7256 if (outsize <= 0) {
7257 Py_ssize_t startinpos, endinpos, outpos;
7258
Victor Stinner7d00cc12014-03-17 23:08:06 +01007259 /* last character in partial decode? */
7260 if (in + insize >= endin && !final)
7261 break;
7262
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 startinpos = in - startin;
7264 endinpos = startinpos + 1;
7265 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007266 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 errors, &errorHandler,
7268 encoding, reason,
7269 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007270 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 {
7272 goto error;
7273 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007274 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 }
7276 else {
7277 in += insize;
7278 memcpy(out, buffer, outsize * sizeof(wchar_t));
7279 out += outsize;
7280 }
7281 }
7282
7283 /* write a NUL character at the end */
7284 *out = 0;
7285
7286 /* Extend unicode object */
7287 outsize = out - startout;
7288 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007289 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007291 /* (in - startin) <= size and size is an int */
7292 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007293
7294error:
7295 Py_XDECREF(encoding_obj);
7296 Py_XDECREF(errorHandler);
7297 Py_XDECREF(exc);
7298 return ret;
7299}
7300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301static PyObject *
7302decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 const char *s, Py_ssize_t size,
7304 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305{
Victor Stinner76a31a62011-11-04 00:05:13 +01007306 PyObject *v = NULL;
7307 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 if (code_page < 0) {
7310 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7311 return NULL;
7312 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007313 if (size < 0) {
7314 PyErr_BadInternalCall();
7315 return NULL;
7316 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007317
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007318 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 do
7322 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007324 if (size > INT_MAX) {
7325 chunk_size = INT_MAX;
7326 final = 0;
7327 done = 0;
7328 }
7329 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007330#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 {
7332 chunk_size = (int)size;
7333 final = (consumed == NULL);
7334 done = 1;
7335 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336
Victor Stinner76a31a62011-11-04 00:05:13 +01007337 if (chunk_size == 0 && done) {
7338 if (v != NULL)
7339 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007340 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007341 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342
Victor Stinner76a31a62011-11-04 00:05:13 +01007343 converted = decode_code_page_strict(code_page, &v,
7344 s, chunk_size);
7345 if (converted == -2)
7346 converted = decode_code_page_errors(code_page, &v,
7347 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007348 errors, final);
7349 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007350
7351 if (converted < 0) {
7352 Py_XDECREF(v);
7353 return NULL;
7354 }
7355
7356 if (consumed)
7357 *consumed += converted;
7358
7359 s += converted;
7360 size -= converted;
7361 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007362
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007363 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364}
7365
Alexander Belopolsky40018472011-02-26 01:02:56 +00007366PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007367PyUnicode_DecodeCodePageStateful(int code_page,
7368 const char *s,
7369 Py_ssize_t size,
7370 const char *errors,
7371 Py_ssize_t *consumed)
7372{
7373 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7374}
7375
7376PyObject *
7377PyUnicode_DecodeMBCSStateful(const char *s,
7378 Py_ssize_t size,
7379 const char *errors,
7380 Py_ssize_t *consumed)
7381{
7382 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7383}
7384
7385PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007386PyUnicode_DecodeMBCS(const char *s,
7387 Py_ssize_t size,
7388 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007389{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7391}
7392
Victor Stinner3a50e702011-10-18 21:21:00 +02007393static DWORD
7394encode_code_page_flags(UINT code_page, const char *errors)
7395{
7396 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007397 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 }
7399 else if (code_page == CP_UTF7) {
7400 /* CP_UTF7 only supports flags=0 */
7401 return 0;
7402 }
7403 else {
7404 if (errors != NULL && strcmp(errors, "replace") == 0)
7405 return 0;
7406 else
7407 return WC_NO_BEST_FIT_CHARS;
7408 }
7409}
7410
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007412 * Encode a Unicode string to a Windows code page into a byte string in strict
7413 * mode.
7414 *
7415 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007416 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007418static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007419encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007420 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422{
Victor Stinner554f3f02010-06-16 23:33:54 +00007423 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 BOOL *pusedDefaultChar = &usedDefaultChar;
7425 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007426 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007427 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 const DWORD flags = encode_code_page_flags(code_page, NULL);
7429 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 /* Create a substring so that we can get the UTF-16 representation
7431 of just the slice under consideration. */
7432 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007433
Martin v. Löwis3d325192011-11-04 18:23:06 +01007434 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007435
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007437 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007439 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007440
Victor Stinner2fc507f2011-11-04 20:06:39 +01007441 substring = PyUnicode_Substring(unicode, offset, offset+len);
7442 if (substring == NULL)
7443 return -1;
7444 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7445 if (p == NULL) {
7446 Py_DECREF(substring);
7447 return -1;
7448 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007449 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007451 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007453 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 NULL, 0,
7455 NULL, pusedDefaultChar);
7456 if (outsize <= 0)
7457 goto error;
7458 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 if (pusedDefaultChar && *pusedDefaultChar) {
7460 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007463
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467 if (*outbytes == NULL) {
7468 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007470 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007472 }
7473 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 const Py_ssize_t n = PyBytes_Size(*outbytes);
7476 if (outsize > PY_SSIZE_T_MAX - n) {
7477 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007478 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7482 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486 }
7487
7488 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007490 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 out, outsize,
7492 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 if (outsize <= 0)
7495 goto error;
7496 if (pusedDefaultChar && *pusedDefaultChar)
7497 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007499
Victor Stinner3a50e702011-10-18 21:21:00 +02007500error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007501 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7503 return -2;
7504 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007505 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007506}
7507
Victor Stinner3a50e702011-10-18 21:21:00 +02007508/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007509 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 * error handler.
7511 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007512 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 * -1 on other error.
7514 */
7515static int
7516encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007517 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007518 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007519{
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007521 Py_ssize_t pos = unicode_offset;
7522 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 /* Ideally, we should get reason from FormatMessage. This is the Windows
7524 2000 English version of the message. */
7525 const char *reason = "invalid character";
7526 /* 4=maximum length of a UTF-8 sequence */
7527 char buffer[4];
7528 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7529 Py_ssize_t outsize;
7530 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 PyObject *errorHandler = NULL;
7532 PyObject *exc = NULL;
7533 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007534 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007535 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 PyObject *rep;
7537 int ret = -1;
7538
7539 assert(insize > 0);
7540
7541 encoding = code_page_name(code_page, &encoding_obj);
7542 if (encoding == NULL)
7543 return -1;
7544
7545 if (errors == NULL || strcmp(errors, "strict") == 0) {
7546 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7547 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007548 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 if (exc != NULL) {
7550 PyCodec_StrictErrors(exc);
7551 Py_DECREF(exc);
7552 }
7553 Py_XDECREF(encoding_obj);
7554 return -1;
7555 }
7556
7557 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7558 pusedDefaultChar = &usedDefaultChar;
7559 else
7560 pusedDefaultChar = NULL;
7561
7562 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7563 PyErr_NoMemory();
7564 goto error;
7565 }
7566 outsize = insize * Py_ARRAY_LENGTH(buffer);
7567
7568 if (*outbytes == NULL) {
7569 /* Create string object */
7570 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7571 if (*outbytes == NULL)
7572 goto error;
7573 out = PyBytes_AS_STRING(*outbytes);
7574 }
7575 else {
7576 /* Extend string object */
7577 Py_ssize_t n = PyBytes_Size(*outbytes);
7578 if (n > PY_SSIZE_T_MAX - outsize) {
7579 PyErr_NoMemory();
7580 goto error;
7581 }
7582 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7583 goto error;
7584 out = PyBytes_AS_STRING(*outbytes) + n;
7585 }
7586
7587 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007588 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007590 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7591 wchar_t chars[2];
7592 int charsize;
7593 if (ch < 0x10000) {
7594 chars[0] = (wchar_t)ch;
7595 charsize = 1;
7596 }
7597 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007598 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7599 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007600 charsize = 2;
7601 }
7602
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007604 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 buffer, Py_ARRAY_LENGTH(buffer),
7606 NULL, pusedDefaultChar);
7607 if (outsize > 0) {
7608 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7609 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007610 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 memcpy(out, buffer, outsize);
7612 out += outsize;
7613 continue;
7614 }
7615 }
7616 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7617 PyErr_SetFromWindowsErr(0);
7618 goto error;
7619 }
7620
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 rep = unicode_encode_call_errorhandler(
7622 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007623 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007624 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 if (rep == NULL)
7626 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007627 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007628
7629 if (PyBytes_Check(rep)) {
7630 outsize = PyBytes_GET_SIZE(rep);
7631 if (outsize != 1) {
7632 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7633 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7634 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7635 Py_DECREF(rep);
7636 goto error;
7637 }
7638 out = PyBytes_AS_STRING(*outbytes) + offset;
7639 }
7640 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7641 out += outsize;
7642 }
7643 else {
7644 Py_ssize_t i;
7645 enum PyUnicode_Kind kind;
7646 void *data;
7647
Benjamin Petersonbac79492012-01-14 13:34:47 -05007648 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 Py_DECREF(rep);
7650 goto error;
7651 }
7652
7653 outsize = PyUnicode_GET_LENGTH(rep);
7654 if (outsize != 1) {
7655 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7656 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7657 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7658 Py_DECREF(rep);
7659 goto error;
7660 }
7661 out = PyBytes_AS_STRING(*outbytes) + offset;
7662 }
7663 kind = PyUnicode_KIND(rep);
7664 data = PyUnicode_DATA(rep);
7665 for (i=0; i < outsize; i++) {
7666 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7667 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007668 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007669 encoding, unicode,
7670 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007671 "unable to encode error handler result to ASCII");
7672 Py_DECREF(rep);
7673 goto error;
7674 }
7675 *out = (unsigned char)ch;
7676 out++;
7677 }
7678 }
7679 Py_DECREF(rep);
7680 }
7681 /* write a NUL byte */
7682 *out = 0;
7683 outsize = out - PyBytes_AS_STRING(*outbytes);
7684 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7685 if (_PyBytes_Resize(outbytes, outsize) < 0)
7686 goto error;
7687 ret = 0;
7688
7689error:
7690 Py_XDECREF(encoding_obj);
7691 Py_XDECREF(errorHandler);
7692 Py_XDECREF(exc);
7693 return ret;
7694}
7695
Victor Stinner3a50e702011-10-18 21:21:00 +02007696static PyObject *
7697encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007698 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 const char *errors)
7700{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007701 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007702 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007703 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007704 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007705
Victor Stinner29dacf22015-01-26 16:41:32 +01007706 if (!PyUnicode_Check(unicode)) {
7707 PyErr_BadArgument();
7708 return NULL;
7709 }
7710
Benjamin Petersonbac79492012-01-14 13:34:47 -05007711 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007712 return NULL;
7713 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007714
Victor Stinner3a50e702011-10-18 21:21:00 +02007715 if (code_page < 0) {
7716 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7717 return NULL;
7718 }
7719
Martin v. Löwis3d325192011-11-04 18:23:06 +01007720 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007721 return PyBytes_FromStringAndSize(NULL, 0);
7722
Victor Stinner7581cef2011-11-03 22:32:33 +01007723 offset = 0;
7724 do
7725 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007726#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007727 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007728 chunks. */
7729 if (len > INT_MAX/2) {
7730 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 done = 0;
7732 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007733 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007734#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007736 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007737 done = 1;
7738 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007739
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007741 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007742 errors);
7743 if (ret == -2)
7744 ret = encode_code_page_errors(code_page, &outbytes,
7745 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007747 if (ret < 0) {
7748 Py_XDECREF(outbytes);
7749 return NULL;
7750 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751
Victor Stinner7581cef2011-11-03 22:32:33 +01007752 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007753 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007754 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007755
Victor Stinner3a50e702011-10-18 21:21:00 +02007756 return outbytes;
7757}
7758
7759PyObject *
7760PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7761 Py_ssize_t size,
7762 const char *errors)
7763{
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007765 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007766 if (unicode == NULL)
7767 return NULL;
7768 res = encode_code_page(CP_ACP, unicode, errors);
7769 Py_DECREF(unicode);
7770 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007771}
7772
7773PyObject *
7774PyUnicode_EncodeCodePage(int code_page,
7775 PyObject *unicode,
7776 const char *errors)
7777{
Victor Stinner7581cef2011-11-03 22:32:33 +01007778 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007779}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007780
Alexander Belopolsky40018472011-02-26 01:02:56 +00007781PyObject *
7782PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007783{
Victor Stinner7581cef2011-11-03 22:32:33 +01007784 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007785}
7786
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007787#undef NEED_RETRY
7788
Steve Dowercc16be82016-09-08 10:35:16 -07007789#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007790
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791/* --- Character Mapping Codec -------------------------------------------- */
7792
Victor Stinnerfb161b12013-04-18 01:44:27 +02007793static int
7794charmap_decode_string(const char *s,
7795 Py_ssize_t size,
7796 PyObject *mapping,
7797 const char *errors,
7798 _PyUnicodeWriter *writer)
7799{
7800 const char *starts = s;
7801 const char *e;
7802 Py_ssize_t startinpos, endinpos;
7803 PyObject *errorHandler = NULL, *exc = NULL;
7804 Py_ssize_t maplen;
7805 enum PyUnicode_Kind mapkind;
7806 void *mapdata;
7807 Py_UCS4 x;
7808 unsigned char ch;
7809
7810 if (PyUnicode_READY(mapping) == -1)
7811 return -1;
7812
7813 maplen = PyUnicode_GET_LENGTH(mapping);
7814 mapdata = PyUnicode_DATA(mapping);
7815 mapkind = PyUnicode_KIND(mapping);
7816
7817 e = s + size;
7818
7819 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7820 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7821 * is disabled in encoding aliases, latin1 is preferred because
7822 * its implementation is faster. */
7823 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7824 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7825 Py_UCS4 maxchar = writer->maxchar;
7826
7827 assert (writer->kind == PyUnicode_1BYTE_KIND);
7828 while (s < e) {
7829 ch = *s;
7830 x = mapdata_ucs1[ch];
7831 if (x > maxchar) {
7832 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7833 goto onError;
7834 maxchar = writer->maxchar;
7835 outdata = (Py_UCS1 *)writer->data;
7836 }
7837 outdata[writer->pos] = x;
7838 writer->pos++;
7839 ++s;
7840 }
7841 return 0;
7842 }
7843
7844 while (s < e) {
7845 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7846 enum PyUnicode_Kind outkind = writer->kind;
7847 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7848 if (outkind == PyUnicode_1BYTE_KIND) {
7849 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7850 Py_UCS4 maxchar = writer->maxchar;
7851 while (s < e) {
7852 ch = *s;
7853 x = mapdata_ucs2[ch];
7854 if (x > maxchar)
7855 goto Error;
7856 outdata[writer->pos] = x;
7857 writer->pos++;
7858 ++s;
7859 }
7860 break;
7861 }
7862 else if (outkind == PyUnicode_2BYTE_KIND) {
7863 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7864 while (s < e) {
7865 ch = *s;
7866 x = mapdata_ucs2[ch];
7867 if (x == 0xFFFE)
7868 goto Error;
7869 outdata[writer->pos] = x;
7870 writer->pos++;
7871 ++s;
7872 }
7873 break;
7874 }
7875 }
7876 ch = *s;
7877
7878 if (ch < maplen)
7879 x = PyUnicode_READ(mapkind, mapdata, ch);
7880 else
7881 x = 0xfffe; /* invalid value */
7882Error:
7883 if (x == 0xfffe)
7884 {
7885 /* undefined mapping */
7886 startinpos = s-starts;
7887 endinpos = startinpos+1;
7888 if (unicode_decode_call_errorhandler_writer(
7889 errors, &errorHandler,
7890 "charmap", "character maps to <undefined>",
7891 &starts, &e, &startinpos, &endinpos, &exc, &s,
7892 writer)) {
7893 goto onError;
7894 }
7895 continue;
7896 }
7897
7898 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7899 goto onError;
7900 ++s;
7901 }
7902 Py_XDECREF(errorHandler);
7903 Py_XDECREF(exc);
7904 return 0;
7905
7906onError:
7907 Py_XDECREF(errorHandler);
7908 Py_XDECREF(exc);
7909 return -1;
7910}
7911
7912static int
7913charmap_decode_mapping(const char *s,
7914 Py_ssize_t size,
7915 PyObject *mapping,
7916 const char *errors,
7917 _PyUnicodeWriter *writer)
7918{
7919 const char *starts = s;
7920 const char *e;
7921 Py_ssize_t startinpos, endinpos;
7922 PyObject *errorHandler = NULL, *exc = NULL;
7923 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007924 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007925
7926 e = s + size;
7927
7928 while (s < e) {
7929 ch = *s;
7930
7931 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7932 key = PyLong_FromLong((long)ch);
7933 if (key == NULL)
7934 goto onError;
7935
7936 item = PyObject_GetItem(mapping, key);
7937 Py_DECREF(key);
7938 if (item == NULL) {
7939 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7940 /* No mapping found means: mapping is undefined. */
7941 PyErr_Clear();
7942 goto Undefined;
7943 } else
7944 goto onError;
7945 }
7946
7947 /* Apply mapping */
7948 if (item == Py_None)
7949 goto Undefined;
7950 if (PyLong_Check(item)) {
7951 long value = PyLong_AS_LONG(item);
7952 if (value == 0xFFFE)
7953 goto Undefined;
7954 if (value < 0 || value > MAX_UNICODE) {
7955 PyErr_Format(PyExc_TypeError,
7956 "character mapping must be in range(0x%lx)",
7957 (unsigned long)MAX_UNICODE + 1);
7958 goto onError;
7959 }
7960
7961 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7962 goto onError;
7963 }
7964 else if (PyUnicode_Check(item)) {
7965 if (PyUnicode_READY(item) == -1)
7966 goto onError;
7967 if (PyUnicode_GET_LENGTH(item) == 1) {
7968 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7969 if (value == 0xFFFE)
7970 goto Undefined;
7971 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7972 goto onError;
7973 }
7974 else {
7975 writer->overallocate = 1;
7976 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7977 goto onError;
7978 }
7979 }
7980 else {
7981 /* wrong return value */
7982 PyErr_SetString(PyExc_TypeError,
7983 "character mapping must return integer, None or str");
7984 goto onError;
7985 }
7986 Py_CLEAR(item);
7987 ++s;
7988 continue;
7989
7990Undefined:
7991 /* undefined mapping */
7992 Py_CLEAR(item);
7993 startinpos = s-starts;
7994 endinpos = startinpos+1;
7995 if (unicode_decode_call_errorhandler_writer(
7996 errors, &errorHandler,
7997 "charmap", "character maps to <undefined>",
7998 &starts, &e, &startinpos, &endinpos, &exc, &s,
7999 writer)) {
8000 goto onError;
8001 }
8002 }
8003 Py_XDECREF(errorHandler);
8004 Py_XDECREF(exc);
8005 return 0;
8006
8007onError:
8008 Py_XDECREF(item);
8009 Py_XDECREF(errorHandler);
8010 Py_XDECREF(exc);
8011 return -1;
8012}
8013
Alexander Belopolsky40018472011-02-26 01:02:56 +00008014PyObject *
8015PyUnicode_DecodeCharmap(const char *s,
8016 Py_ssize_t size,
8017 PyObject *mapping,
8018 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008020 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008021
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 /* Default to Latin-1 */
8023 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008027 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008028 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008029 writer.min_length = size;
8030 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008032
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008033 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008034 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8035 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008036 }
8037 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008038 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8039 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008041 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008042
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008044 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 return NULL;
8046}
8047
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048/* Charmap encoding: the lookup table */
8049
Alexander Belopolsky40018472011-02-26 01:02:56 +00008050struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 PyObject_HEAD
8052 unsigned char level1[32];
8053 int count2, count3;
8054 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008055};
8056
8057static PyObject*
8058encoding_map_size(PyObject *obj, PyObject* args)
8059{
8060 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008063}
8064
8065static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008066 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 PyDoc_STR("Return the size (in bytes) of this object") },
8068 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008069};
8070
8071static void
8072encoding_map_dealloc(PyObject* o)
8073{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075}
8076
8077static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008078 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 "EncodingMap", /*tp_name*/
8080 sizeof(struct encoding_map), /*tp_basicsize*/
8081 0, /*tp_itemsize*/
8082 /* methods */
8083 encoding_map_dealloc, /*tp_dealloc*/
8084 0, /*tp_print*/
8085 0, /*tp_getattr*/
8086 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008087 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 0, /*tp_repr*/
8089 0, /*tp_as_number*/
8090 0, /*tp_as_sequence*/
8091 0, /*tp_as_mapping*/
8092 0, /*tp_hash*/
8093 0, /*tp_call*/
8094 0, /*tp_str*/
8095 0, /*tp_getattro*/
8096 0, /*tp_setattro*/
8097 0, /*tp_as_buffer*/
8098 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8099 0, /*tp_doc*/
8100 0, /*tp_traverse*/
8101 0, /*tp_clear*/
8102 0, /*tp_richcompare*/
8103 0, /*tp_weaklistoffset*/
8104 0, /*tp_iter*/
8105 0, /*tp_iternext*/
8106 encoding_map_methods, /*tp_methods*/
8107 0, /*tp_members*/
8108 0, /*tp_getset*/
8109 0, /*tp_base*/
8110 0, /*tp_dict*/
8111 0, /*tp_descr_get*/
8112 0, /*tp_descr_set*/
8113 0, /*tp_dictoffset*/
8114 0, /*tp_init*/
8115 0, /*tp_alloc*/
8116 0, /*tp_new*/
8117 0, /*tp_free*/
8118 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119};
8120
8121PyObject*
8122PyUnicode_BuildEncodingMap(PyObject* string)
8123{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 PyObject *result;
8125 struct encoding_map *mresult;
8126 int i;
8127 int need_dict = 0;
8128 unsigned char level1[32];
8129 unsigned char level2[512];
8130 unsigned char *mlevel1, *mlevel2, *mlevel3;
8131 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 int kind;
8133 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008134 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008137 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 PyErr_BadArgument();
8139 return NULL;
8140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 kind = PyUnicode_KIND(string);
8142 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008143 length = PyUnicode_GET_LENGTH(string);
8144 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 memset(level1, 0xFF, sizeof level1);
8146 memset(level2, 0xFF, sizeof level2);
8147
8148 /* If there isn't a one-to-one mapping of NULL to \0,
8149 or if there are non-BMP characters, we need to use
8150 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008153 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 ch = PyUnicode_READ(kind, data, i);
8156 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 need_dict = 1;
8158 break;
8159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 /* unmapped character */
8162 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008163 l1 = ch >> 11;
8164 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 if (level1[l1] == 0xFF)
8166 level1[l1] = count2++;
8167 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169 }
8170
8171 if (count2 >= 0xFF || count3 >= 0xFF)
8172 need_dict = 1;
8173
8174 if (need_dict) {
8175 PyObject *result = PyDict_New();
8176 PyObject *key, *value;
8177 if (!result)
8178 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008179 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008181 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182 if (!key || !value)
8183 goto failed1;
8184 if (PyDict_SetItem(result, key, value) == -1)
8185 goto failed1;
8186 Py_DECREF(key);
8187 Py_DECREF(value);
8188 }
8189 return result;
8190 failed1:
8191 Py_XDECREF(key);
8192 Py_XDECREF(value);
8193 Py_DECREF(result);
8194 return NULL;
8195 }
8196
8197 /* Create a three-level trie */
8198 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8199 16*count2 + 128*count3 - 1);
8200 if (!result)
8201 return PyErr_NoMemory();
8202 PyObject_Init(result, &EncodingMapType);
8203 mresult = (struct encoding_map*)result;
8204 mresult->count2 = count2;
8205 mresult->count3 = count3;
8206 mlevel1 = mresult->level1;
8207 mlevel2 = mresult->level23;
8208 mlevel3 = mresult->level23 + 16*count2;
8209 memcpy(mlevel1, level1, 32);
8210 memset(mlevel2, 0xFF, 16*count2);
8211 memset(mlevel3, 0, 128*count3);
8212 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008213 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8216 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 /* unmapped character */
8218 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008219 o1 = ch>>11;
8220 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 i2 = 16*mlevel1[o1] + o2;
8222 if (mlevel2[i2] == 0xFF)
8223 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008224 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008225 i3 = 128*mlevel2[i2] + o3;
8226 mlevel3[i3] = i;
8227 }
8228 return result;
8229}
8230
8231static int
Victor Stinner22168992011-11-20 17:09:18 +01008232encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008233{
8234 struct encoding_map *map = (struct encoding_map*)mapping;
8235 int l1 = c>>11;
8236 int l2 = (c>>7) & 0xF;
8237 int l3 = c & 0x7F;
8238 int i;
8239
Victor Stinner22168992011-11-20 17:09:18 +01008240 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008242 if (c == 0)
8243 return 0;
8244 /* level 1*/
8245 i = map->level1[l1];
8246 if (i == 0xFF) {
8247 return -1;
8248 }
8249 /* level 2*/
8250 i = map->level23[16*i+l2];
8251 if (i == 0xFF) {
8252 return -1;
8253 }
8254 /* level 3 */
8255 i = map->level23[16*map->count2 + 128*i + l3];
8256 if (i == 0) {
8257 return -1;
8258 }
8259 return i;
8260}
8261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262/* Lookup the character ch in the mapping. If the character
8263 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008264 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008265static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008266charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267{
Christian Heimes217cfd12007-12-02 14:31:20 +00008268 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 PyObject *x;
8270
8271 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 x = PyObject_GetItem(mapping, w);
8274 Py_DECREF(w);
8275 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8277 /* No mapping found means: mapping is undefined. */
8278 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008279 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 } else
8281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008283 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008285 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 long value = PyLong_AS_LONG(x);
8287 if (value < 0 || value > 255) {
8288 PyErr_SetString(PyExc_TypeError,
8289 "character mapping must be in range(256)");
8290 Py_DECREF(x);
8291 return NULL;
8292 }
8293 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008295 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 /* wrong return value */
8299 PyErr_Format(PyExc_TypeError,
8300 "character mapping must return integer, bytes or None, not %.400s",
8301 x->ob_type->tp_name);
8302 Py_DECREF(x);
8303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 }
8305}
8306
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008308charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008310 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8311 /* exponentially overallocate to minimize reallocations */
8312 if (requiredsize < 2*outsize)
8313 requiredsize = 2*outsize;
8314 if (_PyBytes_Resize(outobj, requiredsize))
8315 return -1;
8316 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317}
8318
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* the lookup or the resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008323 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 space is available. Return a new reference to the object that
8325 was put in the output buffer, or Py_None, if the mapping was undefined
8326 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008327 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008329charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 PyObject *rep;
8333 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008334 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335
Christian Heimes90aa7642007-12-19 02:45:37 +00008336 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339 if (res == -1)
8340 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 if (outsize<requiredsize)
8342 if (charmapencode_resize(outobj, outpos, requiredsize))
8343 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008344 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 outstart[(*outpos)++] = (char)res;
8346 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 }
8348
8349 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008352 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 Py_DECREF(rep);
8354 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008355 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 if (PyLong_Check(rep)) {
8357 Py_ssize_t requiredsize = *outpos+1;
8358 if (outsize<requiredsize)
8359 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8360 Py_DECREF(rep);
8361 return enc_EXCEPTION;
8362 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008363 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008365 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 else {
8367 const char *repchars = PyBytes_AS_STRING(rep);
8368 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8369 Py_ssize_t requiredsize = *outpos+repsize;
8370 if (outsize<requiredsize)
8371 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8372 Py_DECREF(rep);
8373 return enc_EXCEPTION;
8374 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008375 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 memcpy(outstart + *outpos, repchars, repsize);
8377 *outpos += repsize;
8378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 Py_DECREF(rep);
8381 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382}
8383
8384/* handle an error in PyUnicode_EncodeCharmap
8385 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008386static int
8387charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008388 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008390 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008391 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392{
8393 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008394 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008395 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008396 enum PyUnicode_Kind kind;
8397 void *data;
8398 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008400 Py_ssize_t collstartpos = *inpos;
8401 Py_ssize_t collendpos = *inpos+1;
8402 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 char *encoding = "charmap";
8404 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008405 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008406 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008407 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408
Benjamin Petersonbac79492012-01-14 13:34:47 -05008409 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008410 return -1;
8411 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 /* find all unencodable characters */
8413 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008414 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008415 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008416 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008417 val = encoding_map_lookup(ch, mapping);
8418 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 break;
8420 ++collendpos;
8421 continue;
8422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008423
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008424 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8425 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 if (rep==NULL)
8427 return -1;
8428 else if (rep!=Py_None) {
8429 Py_DECREF(rep);
8430 break;
8431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 }
8435 /* cache callback name lookup
8436 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008437 if (*error_handler == _Py_ERROR_UNKNOWN)
8438 *error_handler = get_error_handler(errors);
8439
8440 switch (*error_handler) {
8441 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008442 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008444
8445 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008446 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 x = charmapencode_output('?', mapping, res, respos);
8448 if (x==enc_EXCEPTION) {
8449 return -1;
8450 }
8451 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008452 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 return -1;
8454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 }
8456 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008457 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 *inpos = collendpos;
8459 break;
Victor Stinner50149202015-09-22 00:26:54 +02008460
8461 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 /* generate replacement (temporarily (mis)uses p) */
8463 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 char buffer[2+29+1+1];
8465 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008466 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 for (cp = buffer; *cp; ++cp) {
8468 x = charmapencode_output(*cp, mapping, res, respos);
8469 if (x==enc_EXCEPTION)
8470 return -1;
8471 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008472 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 return -1;
8474 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 }
8476 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 *inpos = collendpos;
8478 break;
Victor Stinner50149202015-09-22 00:26:54 +02008479
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 default:
Victor Stinner50149202015-09-22 00:26:54 +02008481 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008482 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008484 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008486 if (PyBytes_Check(repunicode)) {
8487 /* Directly copy bytes result to output. */
8488 Py_ssize_t outsize = PyBytes_Size(*res);
8489 Py_ssize_t requiredsize;
8490 repsize = PyBytes_Size(repunicode);
8491 requiredsize = *respos + repsize;
8492 if (requiredsize > outsize)
8493 /* Make room for all additional bytes. */
8494 if (charmapencode_resize(res, respos, requiredsize)) {
8495 Py_DECREF(repunicode);
8496 return -1;
8497 }
8498 memcpy(PyBytes_AsString(*res) + *respos,
8499 PyBytes_AsString(repunicode), repsize);
8500 *respos += repsize;
8501 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008502 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008503 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008505 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008506 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008507 Py_DECREF(repunicode);
8508 return -1;
8509 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008510 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008511 data = PyUnicode_DATA(repunicode);
8512 kind = PyUnicode_KIND(repunicode);
8513 for (index = 0; index < repsize; index++) {
8514 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8515 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008517 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return -1;
8519 }
8520 else if (x==enc_FAILED) {
8521 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008522 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 return -1;
8524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008525 }
8526 *inpos = newpos;
8527 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 }
8529 return 0;
8530}
8531
Alexander Belopolsky40018472011-02-26 01:02:56 +00008532PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008533_PyUnicode_EncodeCharmap(PyObject *unicode,
8534 PyObject *mapping,
8535 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 /* output object */
8538 PyObject *res = NULL;
8539 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008541 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008543 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008544 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008546 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008547 void *data;
8548 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549
Benjamin Petersonbac79492012-01-14 13:34:47 -05008550 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008551 return NULL;
8552 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008553 data = PyUnicode_DATA(unicode);
8554 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 /* Default to Latin-1 */
8557 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 /* allocate enough for a simple encoding without
8561 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008562 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 if (res == NULL)
8564 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008565 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008569 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008571 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 if (x==enc_EXCEPTION) /* error */
8573 goto onError;
8574 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008575 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008577 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 &res, &respos)) {
8579 goto onError;
8580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008581 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 else
8583 /* done with this character => adjust input position */
8584 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008588 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008589 if (_PyBytes_Resize(&res, respos) < 0)
8590 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008593 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 return res;
8595
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 Py_XDECREF(res);
8598 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008599 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 return NULL;
8601}
8602
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008603/* Deprecated */
8604PyObject *
8605PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8606 Py_ssize_t size,
8607 PyObject *mapping,
8608 const char *errors)
8609{
8610 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008611 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612 if (unicode == NULL)
8613 return NULL;
8614 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8615 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008616 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008617}
8618
Alexander Belopolsky40018472011-02-26 01:02:56 +00008619PyObject *
8620PyUnicode_AsCharmapString(PyObject *unicode,
8621 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622{
8623 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 PyErr_BadArgument();
8625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008627 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628}
8629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631static void
8632make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008634 Py_ssize_t startpos, Py_ssize_t endpos,
8635 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 *exceptionObject = _PyUnicodeTranslateError_Create(
8639 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 }
8641 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8643 goto onError;
8644 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8645 goto onError;
8646 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8647 goto onError;
8648 return;
8649 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008650 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 }
8652}
8653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654/* error handling callback helper:
8655 build arguments, call the callback and check the arguments,
8656 put the result into newpos and return the replacement string, which
8657 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658static PyObject *
8659unicode_translate_call_errorhandler(const char *errors,
8660 PyObject **errorHandler,
8661 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663 Py_ssize_t startpos, Py_ssize_t endpos,
8664 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008666 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008668 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 PyObject *restuple;
8670 PyObject *resunicode;
8671
8672 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 }
8677
8678 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008683 restuple = PyObject_CallFunctionObjArgs(
8684 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008688 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 Py_DECREF(restuple);
8690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008692 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 &resunicode, &i_newpos)) {
8694 Py_DECREF(restuple);
8695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008699 else
8700 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008702 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 Py_DECREF(restuple);
8704 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 Py_INCREF(resunicode);
8707 Py_DECREF(restuple);
8708 return resunicode;
8709}
8710
8711/* Lookup the character ch in the mapping and put the result in result,
8712 which must be decrefed by the caller.
8713 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008714static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716{
Christian Heimes217cfd12007-12-02 14:31:20 +00008717 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 PyObject *x;
8719
8720 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 x = PyObject_GetItem(mapping, w);
8723 Py_DECREF(w);
8724 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8726 /* No mapping found means: use 1:1 mapping. */
8727 PyErr_Clear();
8728 *result = NULL;
8729 return 0;
8730 } else
8731 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
8733 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 *result = x;
8735 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008737 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008739 if (value < 0 || value > MAX_UNICODE) {
8740 PyErr_Format(PyExc_ValueError,
8741 "character mapping must be in range(0x%x)",
8742 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 Py_DECREF(x);
8744 return -1;
8745 }
8746 *result = x;
8747 return 0;
8748 }
8749 else if (PyUnicode_Check(x)) {
8750 *result = x;
8751 return 0;
8752 }
8753 else {
8754 /* wrong return value */
8755 PyErr_SetString(PyExc_TypeError,
8756 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 Py_DECREF(x);
8758 return -1;
8759 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760}
Victor Stinner1194ea02014-04-04 19:37:40 +02008761
8762/* lookup the character, write the result into the writer.
8763 Return 1 if the result was written into the writer, return 0 if the mapping
8764 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008765static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008766charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8767 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768{
Victor Stinner1194ea02014-04-04 19:37:40 +02008769 PyObject *item;
8770
8771 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008773
8774 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008776 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008779 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008780 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008781
8782 if (item == Py_None) {
8783 Py_DECREF(item);
8784 return 0;
8785 }
8786
8787 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008788 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8789 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8790 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008791 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8792 Py_DECREF(item);
8793 return -1;
8794 }
8795 Py_DECREF(item);
8796 return 1;
8797 }
8798
8799 if (!PyUnicode_Check(item)) {
8800 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008802 }
8803
8804 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8805 Py_DECREF(item);
8806 return -1;
8807 }
8808
8809 Py_DECREF(item);
8810 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811}
8812
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813static int
8814unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8815 Py_UCS1 *translate)
8816{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008817 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 int ret = 0;
8819
Victor Stinner89a76ab2014-04-05 11:44:04 +02008820 if (charmaptranslate_lookup(ch, mapping, &item)) {
8821 return -1;
8822 }
8823
8824 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008825 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008826 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008827 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008828 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008829 /* not found => default to 1:1 mapping */
8830 translate[ch] = ch;
8831 return 1;
8832 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008833 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008834 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008835 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8836 used it */
8837 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008838 /* invalid character or character outside ASCII:
8839 skip the fast translate */
8840 goto exit;
8841 }
8842 translate[ch] = (Py_UCS1)replace;
8843 }
8844 else if (PyUnicode_Check(item)) {
8845 Py_UCS4 replace;
8846
8847 if (PyUnicode_READY(item) == -1) {
8848 Py_DECREF(item);
8849 return -1;
8850 }
8851 if (PyUnicode_GET_LENGTH(item) != 1)
8852 goto exit;
8853
8854 replace = PyUnicode_READ_CHAR(item, 0);
8855 if (replace > 127)
8856 goto exit;
8857 translate[ch] = (Py_UCS1)replace;
8858 }
8859 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008860 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861 goto exit;
8862 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 ret = 1;
8864
Benjamin Peterson1365de72014-04-07 20:15:41 -04008865 exit:
8866 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008867 return ret;
8868}
8869
8870/* Fast path for ascii => ascii translation. Return 1 if the whole string
8871 was translated into writer, return 0 if the input string was partially
8872 translated into writer, raise an exception and return -1 on error. */
8873static int
8874unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008875 _PyUnicodeWriter *writer, int ignore,
8876 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877{
Victor Stinner872b2912014-04-05 14:27:07 +02008878 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 Py_ssize_t len;
8880 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008881 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883 len = PyUnicode_GET_LENGTH(input);
8884
Victor Stinner872b2912014-04-05 14:27:07 +02008885 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886
8887 in = PyUnicode_1BYTE_DATA(input);
8888 end = in + len;
8889
8890 assert(PyUnicode_IS_ASCII(writer->buffer));
8891 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8892 out = PyUnicode_1BYTE_DATA(writer->buffer);
8893
Victor Stinner872b2912014-04-05 14:27:07 +02008894 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008895 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008896 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008898 int translate = unicode_fast_translate_lookup(mapping, ch,
8899 ascii_table);
8900 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008901 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008902 if (translate == 0)
8903 goto exit;
8904 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008905 }
Victor Stinner872b2912014-04-05 14:27:07 +02008906 if (ch2 == 0xfe) {
8907 if (ignore)
8908 continue;
8909 goto exit;
8910 }
8911 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008912 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008913 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914 }
Victor Stinner872b2912014-04-05 14:27:07 +02008915 res = 1;
8916
8917exit:
8918 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008919 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008920 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008921}
8922
Victor Stinner3222da22015-10-01 22:07:32 +02008923static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924_PyUnicode_TranslateCharmap(PyObject *input,
8925 PyObject *mapping,
8926 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008929 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 Py_ssize_t size, i;
8931 int kind;
8932 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 _PyUnicodeWriter writer;
8934 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008935 char *reason = "character maps to <undefined>";
8936 PyObject *errorHandler = NULL;
8937 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008938 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008939 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008940
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 PyErr_BadArgument();
8943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 if (PyUnicode_READY(input) == -1)
8947 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008948 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 kind = PyUnicode_KIND(input);
8950 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008952 if (size == 0)
8953 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955 /* allocate enough for a simple 1:1 translation without
8956 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008957 _PyUnicodeWriter_Init(&writer);
8958 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960
Victor Stinner872b2912014-04-05 14:27:07 +02008961 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8962
Victor Stinner33798672016-03-01 21:59:58 +01008963 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008964 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008965 if (PyUnicode_IS_ASCII(input)) {
8966 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8967 if (res < 0) {
8968 _PyUnicodeWriter_Dealloc(&writer);
8969 return NULL;
8970 }
8971 if (res == 1)
8972 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973 }
Victor Stinner33798672016-03-01 21:59:58 +01008974 else {
8975 i = 0;
8976 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008980 int translate;
8981 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8982 Py_ssize_t newpos;
8983 /* startpos for collecting untranslatable chars */
8984 Py_ssize_t collstart;
8985 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 ch = PyUnicode_READ(kind, data, i);
8989 translate = charmaptranslate_output(ch, mapping, &writer);
8990 if (translate < 0)
8991 goto onError;
8992
8993 if (translate != 0) {
8994 /* it worked => adjust input pointer */
8995 ++i;
8996 continue;
8997 }
8998
8999 /* untranslatable character */
9000 collstart = i;
9001 collend = i+1;
9002
9003 /* find all untranslatable characters */
9004 while (collend < size) {
9005 PyObject *x;
9006 ch = PyUnicode_READ(kind, data, collend);
9007 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009008 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009009 Py_XDECREF(x);
9010 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 ++collend;
9013 }
9014
9015 if (ignore) {
9016 i = collend;
9017 }
9018 else {
9019 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9020 reason, input, &exc,
9021 collstart, collend, &newpos);
9022 if (repunicode == NULL)
9023 goto onError;
9024 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009026 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009027 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009028 Py_DECREF(repunicode);
9029 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009030 }
9031 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009032 Py_XDECREF(exc);
9033 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009034 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009037 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009038 Py_XDECREF(exc);
9039 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 return NULL;
9041}
9042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043/* Deprecated. Use PyUnicode_Translate instead. */
9044PyObject *
9045PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9046 Py_ssize_t size,
9047 PyObject *mapping,
9048 const char *errors)
9049{
Christian Heimes5f520f42012-09-11 14:03:25 +02009050 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009051 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (!unicode)
9053 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009054 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9055 Py_DECREF(unicode);
9056 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057}
9058
Alexander Belopolsky40018472011-02-26 01:02:56 +00009059PyObject *
9060PyUnicode_Translate(PyObject *str,
9061 PyObject *mapping,
9062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009064 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009065 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009066 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067}
Tim Petersced69f82003-09-16 20:30:58 +00009068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009070fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071{
9072 /* No need to call PyUnicode_READY(self) because this function is only
9073 called as a callback from fixup() which does it already. */
9074 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9075 const int kind = PyUnicode_KIND(self);
9076 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009077 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009078 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 Py_ssize_t i;
9080
9081 for (i = 0; i < len; ++i) {
9082 ch = PyUnicode_READ(kind, data, i);
9083 fixed = 0;
9084 if (ch > 127) {
9085 if (Py_UNICODE_ISSPACE(ch))
9086 fixed = ' ';
9087 else {
9088 const int decimal = Py_UNICODE_TODECIMAL(ch);
9089 if (decimal >= 0)
9090 fixed = '0' + decimal;
9091 }
9092 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009093 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009094 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 PyUnicode_WRITE(kind, data, i, fixed);
9096 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009097 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009098 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 }
9101
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009102 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103}
9104
9105PyObject *
9106_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9107{
9108 if (!PyUnicode_Check(unicode)) {
9109 PyErr_BadInternalCall();
9110 return NULL;
9111 }
9112 if (PyUnicode_READY(unicode) == -1)
9113 return NULL;
9114 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9115 /* If the string is already ASCII, just return the same string */
9116 Py_INCREF(unicode);
9117 return unicode;
9118 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009119 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120}
9121
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009122PyObject *
9123PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9124 Py_ssize_t length)
9125{
Victor Stinnerf0124502011-11-21 23:12:56 +01009126 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009127 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009128 Py_UCS4 maxchar;
9129 enum PyUnicode_Kind kind;
9130 void *data;
9131
Victor Stinner99d7ad02012-02-22 13:37:39 +01009132 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009133 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009134 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009135 if (ch > 127) {
9136 int decimal = Py_UNICODE_TODECIMAL(ch);
9137 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009138 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009139 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009140 }
9141 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009142
9143 /* Copy to a new string */
9144 decimal = PyUnicode_New(length, maxchar);
9145 if (decimal == NULL)
9146 return decimal;
9147 kind = PyUnicode_KIND(decimal);
9148 data = PyUnicode_DATA(decimal);
9149 /* Iterate over code points */
9150 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009151 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009152 if (ch > 127) {
9153 int decimal = Py_UNICODE_TODECIMAL(ch);
9154 if (decimal >= 0)
9155 ch = '0' + decimal;
9156 }
9157 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009159 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009160}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009161/* --- Decimal Encoder ---------------------------------------------------- */
9162
Alexander Belopolsky40018472011-02-26 01:02:56 +00009163int
9164PyUnicode_EncodeDecimal(Py_UNICODE *s,
9165 Py_ssize_t length,
9166 char *output,
9167 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009168{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009169 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009170 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009171 enum PyUnicode_Kind kind;
9172 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009173
9174 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 PyErr_BadArgument();
9176 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009177 }
9178
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009179 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009180 if (unicode == NULL)
9181 return -1;
9182
Victor Stinner42bf7752011-11-21 22:52:58 +01009183 kind = PyUnicode_KIND(unicode);
9184 data = PyUnicode_DATA(unicode);
9185
Victor Stinnerb84d7232011-11-22 01:50:07 +01009186 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009187 PyObject *exc;
9188 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009189 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009190 Py_ssize_t startpos;
9191
9192 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009193
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009195 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009196 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009197 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 decimal = Py_UNICODE_TODECIMAL(ch);
9200 if (decimal >= 0) {
9201 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009202 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 continue;
9204 }
9205 if (0 < ch && ch < 256) {
9206 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009207 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009208 continue;
9209 }
Victor Stinner6345be92011-11-25 20:09:01 +01009210
Victor Stinner42bf7752011-11-21 22:52:58 +01009211 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009212 exc = NULL;
9213 raise_encode_exception(&exc, "decimal", unicode,
9214 startpos, startpos+1,
9215 "invalid decimal Unicode string");
9216 Py_XDECREF(exc);
9217 Py_DECREF(unicode);
9218 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009219 }
9220 /* 0-terminate the output string */
9221 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009222 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009223 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009224}
9225
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226/* --- Helpers ------------------------------------------------------------ */
9227
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length.  `end` is clipped to `len`; a negative `end` or `start` is
   taken relative to `len` and then clipped to 0.  `start` is NOT clipped
   to `len`, so callers still have to compare it against `end`.

   The body is wrapped in do { } while (0) so the macro behaves like a
   single statement (safe inside an unbraced if/else), and every argument
   is parenthesized.  Arguments are evaluated more than once, so they must
   be free of side effects.  Invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009244any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009246 Py_ssize_t end,
9247 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009249 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 void *buf1, *buf2;
9251 Py_ssize_t len1, len2, result;
9252
9253 kind1 = PyUnicode_KIND(s1);
9254 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009255 if (kind1 < kind2)
9256 return -1;
9257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 len1 = PyUnicode_GET_LENGTH(s1);
9259 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009260 ADJUST_INDICES(start, end, len1);
9261 if (end - start < len2)
9262 return -1;
9263
9264 buf1 = PyUnicode_DATA(s1);
9265 buf2 = PyUnicode_DATA(s2);
9266 if (len2 == 1) {
9267 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9268 result = findchar((const char *)buf1 + kind1*start,
9269 kind1, end - start, ch, direction);
9270 if (result == -1)
9271 return -1;
9272 else
9273 return start + result;
9274 }
9275
9276 if (kind2 != kind1) {
9277 buf2 = _PyUnicode_AsKind(s2, kind1);
9278 if (!buf2)
9279 return -2;
9280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281
Victor Stinner794d5672011-10-10 03:21:36 +02009282 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009283 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009284 case PyUnicode_1BYTE_KIND:
9285 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9286 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9287 else
9288 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9289 break;
9290 case PyUnicode_2BYTE_KIND:
9291 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9292 break;
9293 case PyUnicode_4BYTE_KIND:
9294 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9295 break;
9296 default:
9297 assert(0); result = -2;
9298 }
9299 }
9300 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009301 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009302 case PyUnicode_1BYTE_KIND:
9303 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9304 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9305 else
9306 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9307 break;
9308 case PyUnicode_2BYTE_KIND:
9309 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9310 break;
9311 case PyUnicode_4BYTE_KIND:
9312 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9313 break;
9314 default:
9315 assert(0); result = -2;
9316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 }
9318
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009319 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 PyMem_Free(buf2);
9321
9322 return result;
9323}
9324
9325Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009326_PyUnicode_InsertThousandsGrouping(
9327 PyObject *unicode, Py_ssize_t index,
9328 Py_ssize_t n_buffer,
9329 void *digits, Py_ssize_t n_digits,
9330 Py_ssize_t min_width,
9331 const char *grouping, PyObject *thousands_sep,
9332 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333{
Victor Stinner41a863c2012-02-24 00:37:51 +01009334 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009335 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009336 Py_ssize_t thousands_sep_len;
9337 Py_ssize_t len;
9338
9339 if (unicode != NULL) {
9340 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009341 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009342 }
9343 else {
9344 kind = PyUnicode_1BYTE_KIND;
9345 data = NULL;
9346 }
9347 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9348 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9349 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9350 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009351 if (thousands_sep_kind < kind) {
9352 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9353 if (!thousands_sep_data)
9354 return -1;
9355 }
9356 else {
9357 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9358 if (!data)
9359 return -1;
9360 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009361 }
9362
Benjamin Petersonead6b532011-12-20 17:23:42 -06009363 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009365 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009366 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009367 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009368 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009369 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009370 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009371 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009372 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009374 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009378 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009384 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009385 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009386 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009387 break;
9388 default:
9389 assert(0);
9390 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009392 if (unicode != NULL && thousands_sep_kind != kind) {
9393 if (thousands_sep_kind < kind)
9394 PyMem_Free(thousands_sep_data);
9395 else
9396 PyMem_Free(data);
9397 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 if (unicode == NULL) {
9399 *maxchar = 127;
9400 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009401 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009402 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009403 }
9404 }
9405 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406}
9407
9408
Alexander Belopolsky40018472011-02-26 01:02:56 +00009409Py_ssize_t
9410PyUnicode_Count(PyObject *str,
9411 PyObject *substr,
9412 Py_ssize_t start,
9413 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009415 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009416 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 void *buf1 = NULL, *buf2 = NULL;
9418 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009419
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009420 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009422
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009423 kind1 = PyUnicode_KIND(str);
9424 kind2 = PyUnicode_KIND(substr);
9425 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009428 len1 = PyUnicode_GET_LENGTH(str);
9429 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009434 buf1 = PyUnicode_DATA(str);
9435 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009436 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009437 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009438 if (!buf2)
9439 goto onError;
9440 }
9441
9442 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009444 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009445 result = asciilib_count(
9446 ((Py_UCS1*)buf1) + start, end - start,
9447 buf2, len2, PY_SSIZE_T_MAX
9448 );
9449 else
9450 result = ucs1lib_count(
9451 ((Py_UCS1*)buf1) + start, end - start,
9452 buf2, len2, PY_SSIZE_T_MAX
9453 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 break;
9455 case PyUnicode_2BYTE_KIND:
9456 result = ucs2lib_count(
9457 ((Py_UCS2*)buf1) + start, end - start,
9458 buf2, len2, PY_SSIZE_T_MAX
9459 );
9460 break;
9461 case PyUnicode_4BYTE_KIND:
9462 result = ucs4lib_count(
9463 ((Py_UCS4*)buf1) + start, end - start,
9464 buf2, len2, PY_SSIZE_T_MAX
9465 );
9466 break;
9467 default:
9468 assert(0); result = 0;
9469 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009470
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009471 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 PyMem_Free(buf2);
9473
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009476 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 PyMem_Free(buf2);
9478 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479}
9480
Alexander Belopolsky40018472011-02-26 01:02:56 +00009481Py_ssize_t
9482PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009483 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009484 Py_ssize_t start,
9485 Py_ssize_t end,
9486 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009488 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009490
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009491 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492}
9493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494Py_ssize_t
9495PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9496 Py_ssize_t start, Py_ssize_t end,
9497 int direction)
9498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009500 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 if (PyUnicode_READY(str) == -1)
9502 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009503 len = PyUnicode_GET_LENGTH(str);
9504 ADJUST_INDICES(start, end, len);
9505 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009506 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009508 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9509 kind, end-start, ch, direction);
9510 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009512 else
9513 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514}
9515
Alexander Belopolsky40018472011-02-26 01:02:56 +00009516static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009517tailmatch(PyObject *self,
9518 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009519 Py_ssize_t start,
9520 Py_ssize_t end,
9521 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 int kind_self;
9524 int kind_sub;
9525 void *data_self;
9526 void *data_sub;
9527 Py_ssize_t offset;
9528 Py_ssize_t i;
9529 Py_ssize_t end_sub;
9530
9531 if (PyUnicode_READY(self) == -1 ||
9532 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009533 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9536 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009540 if (PyUnicode_GET_LENGTH(substring) == 0)
9541 return 1;
9542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 kind_self = PyUnicode_KIND(self);
9544 data_self = PyUnicode_DATA(self);
9545 kind_sub = PyUnicode_KIND(substring);
9546 data_sub = PyUnicode_DATA(substring);
9547 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9548
9549 if (direction > 0)
9550 offset = end;
9551 else
9552 offset = start;
9553
9554 if (PyUnicode_READ(kind_self, data_self, offset) ==
9555 PyUnicode_READ(kind_sub, data_sub, 0) &&
9556 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9557 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9558 /* If both are of the same kind, memcmp is sufficient */
9559 if (kind_self == kind_sub) {
9560 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009561 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 data_sub,
9563 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009564 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009566 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 else {
9568 /* We do not need to compare 0 and len(substring)-1 because
9569 the if statement above ensured already that they are equal
9570 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 for (i = 1; i < end_sub; ++i) {
9572 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9573 PyUnicode_READ(kind_sub, data_sub, i))
9574 return 0;
9575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578 }
9579
9580 return 0;
9581}
9582
Alexander Belopolsky40018472011-02-26 01:02:56 +00009583Py_ssize_t
9584PyUnicode_Tailmatch(PyObject *str,
9585 PyObject *substr,
9586 Py_ssize_t start,
9587 Py_ssize_t end,
9588 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009590 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009592
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594}
9595
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596/* Apply fixfct filter to the Unicode object self and return a
9597 reference to the modified object */
9598
Alexander Belopolsky40018472011-02-26 01:02:56 +00009599static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009600fixup(PyObject *self,
9601 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 PyObject *u;
9604 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009605 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009607 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009609 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009610 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 /* fix functions return the new maximum character in a string,
9613 if the kind of the resulting unicode object does not change,
9614 everything is fine. Otherwise we need to change the string kind
9615 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009616 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009617
9618 if (maxchar_new == 0) {
9619 /* no changes */;
9620 if (PyUnicode_CheckExact(self)) {
9621 Py_DECREF(u);
9622 Py_INCREF(self);
9623 return self;
9624 }
9625 else
9626 return u;
9627 }
9628
Victor Stinnere6abb482012-05-02 01:15:40 +02009629 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630
Victor Stinnereaab6042011-12-11 22:22:39 +01009631 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009633
9634 /* In case the maximum character changed, we need to
9635 convert the string to the new category. */
9636 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9637 if (v == NULL) {
9638 Py_DECREF(u);
9639 return NULL;
9640 }
9641 if (maxchar_new > maxchar_old) {
9642 /* If the maxchar increased so that the kind changed, not all
9643 characters are representable anymore and we need to fix the
9644 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009645 _PyUnicode_FastCopyCharacters(v, 0,
9646 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009647 maxchar_old = fixfct(v);
9648 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 }
9650 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009651 _PyUnicode_FastCopyCharacters(v, 0,
9652 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009654 Py_DECREF(u);
9655 assert(_PyUnicode_CheckConsistency(v, 1));
9656 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657}
9658
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659static PyObject *
9660ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9663 char *resdata, *data = PyUnicode_DATA(self);
9664 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009665
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 res = PyUnicode_New(len, 127);
9667 if (res == NULL)
9668 return NULL;
9669 resdata = PyUnicode_DATA(res);
9670 if (lower)
9671 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 _Py_bytes_upper(resdata, data, len);
9674 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675}
9676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 Py_ssize_t j;
9681 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009682 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009684
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9686
9687 where ! is a negation and \p{xxx} is a character with property xxx.
9688 */
9689 for (j = i - 1; j >= 0; j--) {
9690 c = PyUnicode_READ(kind, data, j);
9691 if (!_PyUnicode_IsCaseIgnorable(c))
9692 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9695 if (final_sigma) {
9696 for (j = i + 1; j < length; j++) {
9697 c = PyUnicode_READ(kind, data, j);
9698 if (!_PyUnicode_IsCaseIgnorable(c))
9699 break;
9700 }
9701 final_sigma = j == length || !_PyUnicode_IsCased(c);
9702 }
9703 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704}
9705
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706static int
9707lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9708 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 /* Obscure special case. */
9711 if (c == 0x3A3) {
9712 mapped[0] = handle_capital_sigma(kind, data, length, i);
9713 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716}
9717
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718static Py_ssize_t
9719do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 Py_ssize_t i, k = 0;
9722 int n_res, j;
9723 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009724
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009725 c = PyUnicode_READ(kind, data, 0);
9726 n_res = _PyUnicode_ToUpperFull(c, mapped);
9727 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009728 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009729 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 for (i = 1; i < length; i++) {
9732 c = PyUnicode_READ(kind, data, i);
9733 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9734 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009735 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009737 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009738 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740}
9741
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009742static Py_ssize_t
9743do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9744 Py_ssize_t i, k = 0;
9745
9746 for (i = 0; i < length; i++) {
9747 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9748 int n_res, j;
9749 if (Py_UNICODE_ISUPPER(c)) {
9750 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9751 }
9752 else if (Py_UNICODE_ISLOWER(c)) {
9753 n_res = _PyUnicode_ToUpperFull(c, mapped);
9754 }
9755 else {
9756 n_res = 1;
9757 mapped[0] = c;
9758 }
9759 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009760 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009761 res[k++] = mapped[j];
9762 }
9763 }
9764 return k;
9765}
9766
9767static Py_ssize_t
9768do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9769 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009771 Py_ssize_t i, k = 0;
9772
9773 for (i = 0; i < length; i++) {
9774 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9775 int n_res, j;
9776 if (lower)
9777 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9778 else
9779 n_res = _PyUnicode_ToUpperFull(c, mapped);
9780 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009781 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009782 res[k++] = mapped[j];
9783 }
9784 }
9785 return k;
9786}
9787
9788static Py_ssize_t
9789do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9790{
9791 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9792}
9793
9794static Py_ssize_t
9795do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9796{
9797 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9798}
9799
Benjamin Petersone51757f2012-01-12 21:10:29 -05009800static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009801do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9802{
9803 Py_ssize_t i, k = 0;
9804
9805 for (i = 0; i < length; i++) {
9806 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9807 Py_UCS4 mapped[3];
9808 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9809 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009810 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009811 res[k++] = mapped[j];
9812 }
9813 }
9814 return k;
9815}
9816
9817static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009818do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9819{
9820 Py_ssize_t i, k = 0;
9821 int previous_is_cased;
9822
9823 previous_is_cased = 0;
9824 for (i = 0; i < length; i++) {
9825 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9826 Py_UCS4 mapped[3];
9827 int n_res, j;
9828
9829 if (previous_is_cased)
9830 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9831 else
9832 n_res = _PyUnicode_ToTitleFull(c, mapped);
9833
9834 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009835 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009836 res[k++] = mapped[j];
9837 }
9838
9839 previous_is_cased = _PyUnicode_IsCased(c);
9840 }
9841 return k;
9842}
9843
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009844static PyObject *
9845case_operation(PyObject *self,
9846 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9847{
9848 PyObject *res = NULL;
9849 Py_ssize_t length, newlength = 0;
9850 int kind, outkind;
9851 void *data, *outdata;
9852 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9853
Benjamin Petersoneea48462012-01-16 14:28:50 -05009854 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009855
9856 kind = PyUnicode_KIND(self);
9857 data = PyUnicode_DATA(self);
9858 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009859 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009860 PyErr_SetString(PyExc_OverflowError, "string is too long");
9861 return NULL;
9862 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009863 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009864 if (tmp == NULL)
9865 return PyErr_NoMemory();
9866 newlength = perform(kind, data, length, tmp, &maxchar);
9867 res = PyUnicode_New(newlength, maxchar);
9868 if (res == NULL)
9869 goto leave;
9870 tmpend = tmp + newlength;
9871 outdata = PyUnicode_DATA(res);
9872 outkind = PyUnicode_KIND(res);
9873 switch (outkind) {
9874 case PyUnicode_1BYTE_KIND:
9875 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9876 break;
9877 case PyUnicode_2BYTE_KIND:
9878 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9879 break;
9880 case PyUnicode_4BYTE_KIND:
9881 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9882 break;
9883 default:
9884 assert(0);
9885 break;
9886 }
9887 leave:
9888 PyMem_FREE(tmp);
9889 return res;
9890}
9891
Tim Peters8ce9f162004-08-27 01:49:32 +00009892PyObject *
9893PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009895 PyObject *res;
9896 PyObject *fseq;
9897 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009898 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009900 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009901 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009902 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009903 }
9904
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009905 /* NOTE: the following code can't call back into Python code,
9906 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009907 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009908
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009909 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009910 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009911 res = _PyUnicode_JoinArray(separator, items, seqlen);
9912 Py_DECREF(fseq);
9913 return res;
9914}
9915
9916PyObject *
9917_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9918{
9919 PyObject *res = NULL; /* the result */
9920 PyObject *sep = NULL;
9921 Py_ssize_t seplen;
9922 PyObject *item;
9923 Py_ssize_t sz, i, res_offset;
9924 Py_UCS4 maxchar;
9925 Py_UCS4 item_maxchar;
9926 int use_memcpy;
9927 unsigned char *res_data = NULL, *sep_data = NULL;
9928 PyObject *last_obj;
9929 unsigned int kind = 0;
9930
Tim Peters05eba1f2004-08-27 21:32:02 +00009931 /* If empty sequence, return u"". */
9932 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009933 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009934 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009935
Tim Peters05eba1f2004-08-27 21:32:02 +00009936 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009937 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009938 if (seqlen == 1) {
9939 if (PyUnicode_CheckExact(items[0])) {
9940 res = items[0];
9941 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009942 return res;
9943 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009944 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009945 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009946 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009947 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009948 /* Set up sep and seplen */
9949 if (separator == NULL) {
9950 /* fall back to a blank space separator */
9951 sep = PyUnicode_FromOrdinal(' ');
9952 if (!sep)
9953 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009954 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009955 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009956 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009957 else {
9958 if (!PyUnicode_Check(separator)) {
9959 PyErr_Format(PyExc_TypeError,
9960 "separator: expected str instance,"
9961 " %.80s found",
9962 Py_TYPE(separator)->tp_name);
9963 goto onError;
9964 }
9965 if (PyUnicode_READY(separator))
9966 goto onError;
9967 sep = separator;
9968 seplen = PyUnicode_GET_LENGTH(separator);
9969 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9970 /* inc refcount to keep this code path symmetric with the
9971 above case of a blank separator */
9972 Py_INCREF(sep);
9973 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009974 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009975 }
9976
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009977 /* There are at least two things to join, or else we have a subclass
9978 * of str in the sequence.
9979 * Do a pre-pass to figure out the total amount of space we'll
9980 * need (sz), and see whether all argument are strings.
9981 */
9982 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009983#ifdef Py_DEBUG
9984 use_memcpy = 0;
9985#else
9986 use_memcpy = 1;
9987#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009988 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009989 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009990 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009991 if (!PyUnicode_Check(item)) {
9992 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009993 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009994 " %.80s found",
9995 i, Py_TYPE(item)->tp_name);
9996 goto onError;
9997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 if (PyUnicode_READY(item) == -1)
9999 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010000 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010002 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010003 if (i != 0) {
10004 add_sz += seplen;
10005 }
10006 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010007 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010008 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 goto onError;
10010 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010011 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010012 if (use_memcpy && last_obj != NULL) {
10013 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10014 use_memcpy = 0;
10015 }
10016 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010017 }
Tim Petersced69f82003-09-16 20:30:58 +000010018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010020 if (res == NULL)
10021 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010022
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010023 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010024#ifdef Py_DEBUG
10025 use_memcpy = 0;
10026#else
10027 if (use_memcpy) {
10028 res_data = PyUnicode_1BYTE_DATA(res);
10029 kind = PyUnicode_KIND(res);
10030 if (seplen != 0)
10031 sep_data = PyUnicode_1BYTE_DATA(sep);
10032 }
10033#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010034 if (use_memcpy) {
10035 for (i = 0; i < seqlen; ++i) {
10036 Py_ssize_t itemlen;
10037 item = items[i];
10038
10039 /* Copy item, and maybe the separator. */
10040 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010041 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010043 kind * seplen);
10044 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010045 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010046
10047 itemlen = PyUnicode_GET_LENGTH(item);
10048 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010049 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010050 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010051 kind * itemlen);
10052 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010053 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010054 }
10055 assert(res_data == PyUnicode_1BYTE_DATA(res)
10056 + kind * PyUnicode_GET_LENGTH(res));
10057 }
10058 else {
10059 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10060 Py_ssize_t itemlen;
10061 item = items[i];
10062
10063 /* Copy item, and maybe the separator. */
10064 if (i && seplen != 0) {
10065 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10066 res_offset += seplen;
10067 }
10068
10069 itemlen = PyUnicode_GET_LENGTH(item);
10070 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010071 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010072 res_offset += itemlen;
10073 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010074 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010075 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010076 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010079 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081
Benjamin Peterson29060642009-01-31 22:14:21 +000010082 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010084 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085 return NULL;
10086}
10087
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive code units of the buffer `data`, beginning at unit index
   `start`.  `kind` selects the unit type (unsigned char, Py_UCS2 or
   Py_UCS4); PyUnicode_WCHAR_KIND is rejected by an assert.

   Every argument is parenthesized where it is used, so compound
   expressions are safe to pass.  Arguments may still be evaluated more than
   once, so they must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10111
Victor Stinnerd3f08822012-05-29 12:57:52 +020010112void
10113_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10114 Py_UCS4 fill_char)
10115{
10116 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10117 const void *data = PyUnicode_DATA(unicode);
10118 assert(PyUnicode_IS_READY(unicode));
10119 assert(unicode_modifiable(unicode));
10120 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10121 assert(start >= 0);
10122 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10123 FILL(kind, data, fill_char, start, length);
10124}
10125
Victor Stinner3fe55312012-01-04 00:33:50 +010010126Py_ssize_t
10127PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10128 Py_UCS4 fill_char)
10129{
10130 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010131
10132 if (!PyUnicode_Check(unicode)) {
10133 PyErr_BadInternalCall();
10134 return -1;
10135 }
10136 if (PyUnicode_READY(unicode) == -1)
10137 return -1;
10138 if (unicode_check_modifiable(unicode))
10139 return -1;
10140
Victor Stinnerd3f08822012-05-29 12:57:52 +020010141 if (start < 0) {
10142 PyErr_SetString(PyExc_IndexError, "string index out of range");
10143 return -1;
10144 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010145 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10146 PyErr_SetString(PyExc_ValueError,
10147 "fill character is bigger than "
10148 "the string maximum character");
10149 return -1;
10150 }
10151
10152 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10153 length = Py_MIN(maxlen, length);
10154 if (length <= 0)
10155 return 0;
10156
Victor Stinnerd3f08822012-05-29 12:57:52 +020010157 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010158 return length;
10159}
10160
Victor Stinner9310abb2011-10-05 00:59:23 +020010161static PyObject *
10162pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010163 Py_ssize_t left,
10164 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 PyObject *u;
10168 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010169 int kind;
10170 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171
10172 if (left < 0)
10173 left = 0;
10174 if (right < 0)
10175 right = 0;
10176
Victor Stinnerc4b49542011-12-11 22:44:26 +010010177 if (left == 0 && right == 0)
10178 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10181 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010182 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10183 return NULL;
10184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010186 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010188 if (!u)
10189 return NULL;
10190
10191 kind = PyUnicode_KIND(u);
10192 data = PyUnicode_DATA(u);
10193 if (left)
10194 FILL(kind, data, fill, 0, left);
10195 if (right)
10196 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010197 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010198 assert(_PyUnicode_CheckConsistency(u, 1));
10199 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200}
10201
Alexander Belopolsky40018472011-02-26 01:02:56 +000010202PyObject *
10203PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010207 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209
Benjamin Petersonead6b532011-12-20 17:23:42 -060010210 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212 if (PyUnicode_IS_ASCII(string))
10213 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010214 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010215 PyUnicode_GET_LENGTH(string), keepends);
10216 else
10217 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010218 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010219 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 break;
10221 case PyUnicode_2BYTE_KIND:
10222 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010223 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 PyUnicode_GET_LENGTH(string), keepends);
10225 break;
10226 case PyUnicode_4BYTE_KIND:
10227 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010228 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 PyUnicode_GET_LENGTH(string), keepends);
10230 break;
10231 default:
10232 assert(0);
10233 list = 0;
10234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236}
10237
Alexander Belopolsky40018472011-02-26 01:02:56 +000010238static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010239split(PyObject *self,
10240 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010241 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010243 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 void *buf1, *buf2;
10245 Py_ssize_t len1, len2;
10246 PyObject* out;
10247
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010249 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 if (PyUnicode_READY(self) == -1)
10252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010255 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010257 if (PyUnicode_IS_ASCII(self))
10258 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010259 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010260 PyUnicode_GET_LENGTH(self), maxcount
10261 );
10262 else
10263 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010264 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010265 PyUnicode_GET_LENGTH(self), maxcount
10266 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 case PyUnicode_2BYTE_KIND:
10268 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010269 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 PyUnicode_GET_LENGTH(self), maxcount
10271 );
10272 case PyUnicode_4BYTE_KIND:
10273 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010274 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 PyUnicode_GET_LENGTH(self), maxcount
10276 );
10277 default:
10278 assert(0);
10279 return NULL;
10280 }
10281
10282 if (PyUnicode_READY(substring) == -1)
10283 return NULL;
10284
10285 kind1 = PyUnicode_KIND(self);
10286 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 len1 = PyUnicode_GET_LENGTH(self);
10288 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010289 if (kind1 < kind2 || len1 < len2) {
10290 out = PyList_New(1);
10291 if (out == NULL)
10292 return NULL;
10293 Py_INCREF(self);
10294 PyList_SET_ITEM(out, 0, self);
10295 return out;
10296 }
10297 buf1 = PyUnicode_DATA(self);
10298 buf2 = PyUnicode_DATA(substring);
10299 if (kind2 != kind1) {
10300 buf2 = _PyUnicode_AsKind(substring, kind1);
10301 if (!buf2)
10302 return NULL;
10303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010305 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010307 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10308 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010310 else
10311 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010312 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 break;
10314 case PyUnicode_2BYTE_KIND:
10315 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010316 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 break;
10318 case PyUnicode_4BYTE_KIND:
10319 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010320 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 break;
10322 default:
10323 out = NULL;
10324 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010325 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 PyMem_Free(buf2);
10327 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328}
10329
Alexander Belopolsky40018472011-02-26 01:02:56 +000010330static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010331rsplit(PyObject *self,
10332 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010333 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010334{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010335 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 void *buf1, *buf2;
10337 Py_ssize_t len1, len2;
10338 PyObject* out;
10339
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010340 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010341 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 if (PyUnicode_READY(self) == -1)
10344 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010347 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 if (PyUnicode_IS_ASCII(self))
10350 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010351 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010352 PyUnicode_GET_LENGTH(self), maxcount
10353 );
10354 else
10355 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010356 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010357 PyUnicode_GET_LENGTH(self), maxcount
10358 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 case PyUnicode_2BYTE_KIND:
10360 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010361 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 PyUnicode_GET_LENGTH(self), maxcount
10363 );
10364 case PyUnicode_4BYTE_KIND:
10365 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010366 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 PyUnicode_GET_LENGTH(self), maxcount
10368 );
10369 default:
10370 assert(0);
10371 return NULL;
10372 }
10373
10374 if (PyUnicode_READY(substring) == -1)
10375 return NULL;
10376
10377 kind1 = PyUnicode_KIND(self);
10378 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 len1 = PyUnicode_GET_LENGTH(self);
10380 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010381 if (kind1 < kind2 || len1 < len2) {
10382 out = PyList_New(1);
10383 if (out == NULL)
10384 return NULL;
10385 Py_INCREF(self);
10386 PyList_SET_ITEM(out, 0, self);
10387 return out;
10388 }
10389 buf1 = PyUnicode_DATA(self);
10390 buf2 = PyUnicode_DATA(substring);
10391 if (kind2 != kind1) {
10392 buf2 = _PyUnicode_AsKind(substring, kind1);
10393 if (!buf2)
10394 return NULL;
10395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010397 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10400 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010401 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010402 else
10403 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 break;
10406 case PyUnicode_2BYTE_KIND:
10407 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010408 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 break;
10410 case PyUnicode_4BYTE_KIND:
10411 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010412 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 break;
10414 default:
10415 out = NULL;
10416 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010417 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 PyMem_Free(buf2);
10419 return out;
10420}
10421
10422static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010423anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10424 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010426 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010428 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10429 return asciilib_find(buf1, len1, buf2, len2, offset);
10430 else
10431 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 case PyUnicode_2BYTE_KIND:
10433 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10434 case PyUnicode_4BYTE_KIND:
10435 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10436 }
10437 assert(0);
10438 return -1;
10439}
10440
10441static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010442anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10443 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010445 switch (kind) {
10446 case PyUnicode_1BYTE_KIND:
10447 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10448 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10449 else
10450 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10451 case PyUnicode_2BYTE_KIND:
10452 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10453 case PyUnicode_4BYTE_KIND:
10454 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10455 }
10456 assert(0);
10457 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010458}
10459
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010460static void
10461replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10462 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10463{
10464 int kind = PyUnicode_KIND(u);
10465 void *data = PyUnicode_DATA(u);
10466 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10467 if (kind == PyUnicode_1BYTE_KIND) {
10468 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10469 (Py_UCS1 *)data + len,
10470 u1, u2, maxcount);
10471 }
10472 else if (kind == PyUnicode_2BYTE_KIND) {
10473 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10474 (Py_UCS2 *)data + len,
10475 u1, u2, maxcount);
10476 }
10477 else {
10478 assert(kind == PyUnicode_4BYTE_KIND);
10479 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10480 (Py_UCS4 *)data + len,
10481 u1, u2, maxcount);
10482 }
10483}
10484
Alexander Belopolsky40018472011-02-26 01:02:56 +000010485static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486replace(PyObject *self, PyObject *str1,
10487 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 PyObject *u;
10490 char *sbuf = PyUnicode_DATA(self);
10491 char *buf1 = PyUnicode_DATA(str1);
10492 char *buf2 = PyUnicode_DATA(str2);
10493 int srelease = 0, release1 = 0, release2 = 0;
10494 int skind = PyUnicode_KIND(self);
10495 int kind1 = PyUnicode_KIND(str1);
10496 int kind2 = PyUnicode_KIND(str2);
10497 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10498 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10499 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010500 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010501 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
10503 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010504 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010506 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507
Victor Stinner59de0ee2011-10-07 10:01:28 +020010508 if (str1 == str2)
10509 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510
Victor Stinner49a0a212011-10-12 23:46:10 +020010511 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010512 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10513 if (maxchar < maxchar_str1)
10514 /* substring too wide to be present */
10515 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010516 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10517 /* Replacing str1 with str2 may cause a maxchar reduction in the
10518 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010519 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010520 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010525 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010527 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010528 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010529 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010530
Victor Stinner69ed0f42013-04-09 21:48:24 +020010531 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010532 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010533 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010534 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010535 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010537 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010539
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010540 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10541 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010542 }
10543 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 int rkind = skind;
10545 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010546 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 if (kind1 < rkind) {
10549 /* widen substring */
10550 buf1 = _PyUnicode_AsKind(str1, rkind);
10551 if (!buf1) goto error;
10552 release1 = 1;
10553 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010554 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010555 if (i < 0)
10556 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 if (rkind > kind2) {
10558 /* widen replacement */
10559 buf2 = _PyUnicode_AsKind(str2, rkind);
10560 if (!buf2) goto error;
10561 release2 = 1;
10562 }
10563 else if (rkind < kind2) {
10564 /* widen self and buf1 */
10565 rkind = kind2;
10566 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010567 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 sbuf = _PyUnicode_AsKind(self, rkind);
10569 if (!sbuf) goto error;
10570 srelease = 1;
10571 buf1 = _PyUnicode_AsKind(str1, rkind);
10572 if (!buf1) goto error;
10573 release1 = 1;
10574 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010575 u = PyUnicode_New(slen, maxchar);
10576 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010578 assert(PyUnicode_KIND(u) == rkind);
10579 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010580
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010581 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010582 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010583 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010585 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010587
10588 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010589 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010590 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010591 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010592 if (i == -1)
10593 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010594 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010596 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010600 }
10601 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010603 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 int rkind = skind;
10605 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010608 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 buf1 = _PyUnicode_AsKind(str1, rkind);
10610 if (!buf1) goto error;
10611 release1 = 1;
10612 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010613 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614 if (n == 0)
10615 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010617 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 buf2 = _PyUnicode_AsKind(str2, rkind);
10619 if (!buf2) goto error;
10620 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010623 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 rkind = kind2;
10625 sbuf = _PyUnicode_AsKind(self, rkind);
10626 if (!sbuf) goto error;
10627 srelease = 1;
10628 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010629 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 buf1 = _PyUnicode_AsKind(str1, rkind);
10631 if (!buf1) goto error;
10632 release1 = 1;
10633 }
10634 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10635 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010636 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 PyErr_SetString(PyExc_OverflowError,
10638 "replace string is too long");
10639 goto error;
10640 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010641 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010642 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010643 _Py_INCREF_UNICODE_EMPTY();
10644 if (!unicode_empty)
10645 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 u = unicode_empty;
10647 goto done;
10648 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010649 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 PyErr_SetString(PyExc_OverflowError,
10651 "replace string is too long");
10652 goto error;
10653 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010654 u = PyUnicode_New(new_size, maxchar);
10655 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010657 assert(PyUnicode_KIND(u) == rkind);
10658 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 ires = i = 0;
10660 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 while (n-- > 0) {
10662 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010663 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010665 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010666 if (j == -1)
10667 break;
10668 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010670 memcpy(res + rkind * ires,
10671 sbuf + rkind * i,
10672 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010674 }
10675 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010677 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010679 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010685 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010686 memcpy(res + rkind * ires,
10687 sbuf + rkind * i,
10688 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010689 }
10690 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010691 /* interleave */
10692 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010693 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010695 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697 if (--n <= 0)
10698 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010699 memcpy(res + rkind * ires,
10700 sbuf + rkind * i,
10701 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 ires++;
10703 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010704 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010705 memcpy(res + rkind * ires,
10706 sbuf + rkind * i,
10707 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010708 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010709 }
10710
10711 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010712 unicode_adjust_maxchar(&u);
10713 if (u == NULL)
10714 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010716
10717 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (srelease)
10719 PyMem_FREE(sbuf);
10720 if (release1)
10721 PyMem_FREE(buf1);
10722 if (release2)
10723 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010724 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010726
Benjamin Peterson29060642009-01-31 22:14:21 +000010727 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010728 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 if (srelease)
10730 PyMem_FREE(sbuf);
10731 if (release1)
10732 PyMem_FREE(buf1);
10733 if (release2)
10734 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010735 return unicode_result_unchanged(self);
10736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 error:
10738 if (srelease && sbuf)
10739 PyMem_FREE(sbuf);
10740 if (release1 && buf1)
10741 PyMem_FREE(buf1);
10742 if (release2 && buf2)
10743 PyMem_FREE(buf2);
10744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745}
10746
10747/* --- Unicode Object Methods --------------------------------------------- */
10748
INADA Naoki3ae20562017-01-16 20:41:20 +090010749/*[clinic input]
10750str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751
INADA Naoki3ae20562017-01-16 20:41:20 +090010752Return a version of the string where each word is titlecased.
10753
10754More specifically, words start with uppercased characters and all remaining
10755cased characters have lower case.
10756[clinic start generated code]*/
10757
10758static PyObject *
10759unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010760/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010762 if (PyUnicode_READY(self) == -1)
10763 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010764 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765}
10766
INADA Naoki3ae20562017-01-16 20:41:20 +090010767/*[clinic input]
10768str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769
INADA Naoki3ae20562017-01-16 20:41:20 +090010770Return a capitalized version of the string.
10771
10772More specifically, make the first character have upper case and the rest lower
10773case.
10774[clinic start generated code]*/
10775
10776static PyObject *
10777unicode_capitalize_impl(PyObject *self)
10778/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010780 if (PyUnicode_READY(self) == -1)
10781 return NULL;
10782 if (PyUnicode_GET_LENGTH(self) == 0)
10783 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010784 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785}
10786
INADA Naoki3ae20562017-01-16 20:41:20 +090010787/*[clinic input]
10788str.casefold as unicode_casefold
10789
10790Return a version of the string suitable for caseless comparisons.
10791[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010792
10793static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010794unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010795/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010796{
10797 if (PyUnicode_READY(self) == -1)
10798 return NULL;
10799 if (PyUnicode_IS_ASCII(self))
10800 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010801 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010802}
10803
10804
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010805/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010806
10807static int
10808convert_uc(PyObject *obj, void *addr)
10809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010811
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010812 if (!PyUnicode_Check(obj)) {
10813 PyErr_Format(PyExc_TypeError,
10814 "The fill character must be a unicode character, "
10815 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010816 return 0;
10817 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010818 if (PyUnicode_READY(obj) < 0)
10819 return 0;
10820 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010821 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010822 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010823 return 0;
10824 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010825 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010826 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010827}
10828
INADA Naoki3ae20562017-01-16 20:41:20 +090010829/*[clinic input]
10830str.center as unicode_center
10831
10832 width: Py_ssize_t
10833 fillchar: Py_UCS4 = ' '
10834 /
10835
10836Return a centered string of length width.
10837
10838Padding is done using the specified fill character (default is a space).
10839[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840
10841static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010842unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10843/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010845 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846
Benjamin Petersonbac79492012-01-14 13:34:47 -050010847 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848 return NULL;
10849
Victor Stinnerc4b49542011-12-11 22:44:26 +010010850 if (PyUnicode_GET_LENGTH(self) >= width)
10851 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852
Victor Stinnerc4b49542011-12-11 22:44:26 +010010853 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854 left = marg / 2 + (marg & width & 1);
10855
Victor Stinner9310abb2011-10-05 00:59:23 +020010856 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857}
10858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859/* This function assumes that str1 and str2 are readied by the caller. */
10860
Marc-André Lemburge5034372000-08-08 08:04:29 +000010861static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010862unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010863{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010864#define COMPARE(TYPE1, TYPE2) \
10865 do { \
10866 TYPE1* p1 = (TYPE1 *)data1; \
10867 TYPE2* p2 = (TYPE2 *)data2; \
10868 TYPE1* end = p1 + len; \
10869 Py_UCS4 c1, c2; \
10870 for (; p1 != end; p1++, p2++) { \
10871 c1 = *p1; \
10872 c2 = *p2; \
10873 if (c1 != c2) \
10874 return (c1 < c2) ? -1 : 1; \
10875 } \
10876 } \
10877 while (0)
10878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 int kind1, kind2;
10880 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010881 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 kind1 = PyUnicode_KIND(str1);
10884 kind2 = PyUnicode_KIND(str2);
10885 data1 = PyUnicode_DATA(str1);
10886 data2 = PyUnicode_DATA(str2);
10887 len1 = PyUnicode_GET_LENGTH(str1);
10888 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010889 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010890
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010891 switch(kind1) {
10892 case PyUnicode_1BYTE_KIND:
10893 {
10894 switch(kind2) {
10895 case PyUnicode_1BYTE_KIND:
10896 {
10897 int cmp = memcmp(data1, data2, len);
10898 /* normalize result of memcmp() into the range [-1; 1] */
10899 if (cmp < 0)
10900 return -1;
10901 if (cmp > 0)
10902 return 1;
10903 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010904 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010905 case PyUnicode_2BYTE_KIND:
10906 COMPARE(Py_UCS1, Py_UCS2);
10907 break;
10908 case PyUnicode_4BYTE_KIND:
10909 COMPARE(Py_UCS1, Py_UCS4);
10910 break;
10911 default:
10912 assert(0);
10913 }
10914 break;
10915 }
10916 case PyUnicode_2BYTE_KIND:
10917 {
10918 switch(kind2) {
10919 case PyUnicode_1BYTE_KIND:
10920 COMPARE(Py_UCS2, Py_UCS1);
10921 break;
10922 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010923 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010924 COMPARE(Py_UCS2, Py_UCS2);
10925 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010926 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010927 case PyUnicode_4BYTE_KIND:
10928 COMPARE(Py_UCS2, Py_UCS4);
10929 break;
10930 default:
10931 assert(0);
10932 }
10933 break;
10934 }
10935 case PyUnicode_4BYTE_KIND:
10936 {
10937 switch(kind2) {
10938 case PyUnicode_1BYTE_KIND:
10939 COMPARE(Py_UCS4, Py_UCS1);
10940 break;
10941 case PyUnicode_2BYTE_KIND:
10942 COMPARE(Py_UCS4, Py_UCS2);
10943 break;
10944 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010945 {
10946#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10947 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10948 /* normalize result of wmemcmp() into the range [-1; 1] */
10949 if (cmp < 0)
10950 return -1;
10951 if (cmp > 0)
10952 return 1;
10953#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010954 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010955#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010956 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010957 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010958 default:
10959 assert(0);
10960 }
10961 break;
10962 }
10963 default:
10964 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010965 }
10966
Victor Stinner770e19e2012-10-04 22:59:45 +020010967 if (len1 == len2)
10968 return 0;
10969 if (len1 < len2)
10970 return -1;
10971 else
10972 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010973
10974#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010975}
10976
Benjamin Peterson621b4302016-09-09 13:54:34 -070010977static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010978unicode_compare_eq(PyObject *str1, PyObject *str2)
10979{
10980 int kind;
10981 void *data1, *data2;
10982 Py_ssize_t len;
10983 int cmp;
10984
Victor Stinnere5567ad2012-10-23 02:48:49 +020010985 len = PyUnicode_GET_LENGTH(str1);
10986 if (PyUnicode_GET_LENGTH(str2) != len)
10987 return 0;
10988 kind = PyUnicode_KIND(str1);
10989 if (PyUnicode_KIND(str2) != kind)
10990 return 0;
10991 data1 = PyUnicode_DATA(str1);
10992 data2 = PyUnicode_DATA(str2);
10993
10994 cmp = memcmp(data1, data2, len * kind);
10995 return (cmp == 0);
10996}
10997
10998
Alexander Belopolsky40018472011-02-26 01:02:56 +000010999int
11000PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11003 if (PyUnicode_READY(left) == -1 ||
11004 PyUnicode_READY(right) == -1)
11005 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011006
11007 /* a string is equal to itself */
11008 if (left == right)
11009 return 0;
11010
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011011 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011013 PyErr_Format(PyExc_TypeError,
11014 "Can't compare %.100s and %.100s",
11015 left->ob_type->tp_name,
11016 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 return -1;
11018}
11019
Martin v. Löwis5b222132007-06-10 09:51:05 +000011020int
11021PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 Py_ssize_t i;
11024 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011026 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027
Victor Stinner910337b2011-10-03 03:20:16 +020011028 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011029 if (!PyUnicode_IS_READY(uni)) {
11030 const wchar_t *ws = _PyUnicode_WSTR(uni);
11031 /* Compare Unicode string and source character set string */
11032 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11033 if (chr != ustr[i])
11034 return (chr < ustr[i]) ? -1 : 1;
11035 }
11036 /* This check keeps Python strings that end in '\0' from comparing equal
11037 to C strings identical up to that point. */
11038 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11039 return 1; /* uni is longer */
11040 if (ustr[i])
11041 return -1; /* str is longer */
11042 return 0;
11043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011045 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011046 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011047 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011048 size_t len, len2 = strlen(str);
11049 int cmp;
11050
11051 len = Py_MIN(len1, len2);
11052 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011053 if (cmp != 0) {
11054 if (cmp < 0)
11055 return -1;
11056 else
11057 return 1;
11058 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011059 if (len1 > len2)
11060 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011061 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011062 return -1; /* str is longer */
11063 return 0;
11064 }
11065 else {
11066 void *data = PyUnicode_DATA(uni);
11067 /* Compare Unicode string and source character set string */
11068 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011069 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011070 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11071 /* This check keeps Python strings that end in '\0' from comparing equal
11072 to C strings identical up to that point. */
11073 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11074 return 1; /* uni is longer */
11075 if (str[i])
11076 return -1; /* str is longer */
11077 return 0;
11078 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011079}
11080
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011081static int
11082non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11083{
11084 size_t i, len;
11085 const wchar_t *p;
11086 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11087 if (strlen(str) != len)
11088 return 0;
11089 p = _PyUnicode_WSTR(unicode);
11090 assert(p);
11091 for (i = 0; i < len; i++) {
11092 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011093 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011094 return 0;
11095 }
11096 return 1;
11097}
11098
11099int
11100_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11101{
11102 size_t len;
11103 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011104 assert(str);
11105#ifndef NDEBUG
11106 for (const char *p = str; *p; p++) {
11107 assert((unsigned char)*p < 128);
11108 }
11109#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011110 if (PyUnicode_READY(unicode) == -1) {
11111 /* Memory error or bad data */
11112 PyErr_Clear();
11113 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11114 }
11115 if (!PyUnicode_IS_ASCII(unicode))
11116 return 0;
11117 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11118 return strlen(str) == len &&
11119 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11120}
11121
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011122int
11123_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11124{
11125 PyObject *right_uni;
11126 Py_hash_t hash;
11127
11128 assert(_PyUnicode_CHECK(left));
11129 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011130#ifndef NDEBUG
11131 for (const char *p = right->string; *p; p++) {
11132 assert((unsigned char)*p < 128);
11133 }
11134#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011135
11136 if (PyUnicode_READY(left) == -1) {
11137 /* memory error or bad data */
11138 PyErr_Clear();
11139 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11140 }
11141
11142 if (!PyUnicode_IS_ASCII(left))
11143 return 0;
11144
11145 right_uni = _PyUnicode_FromId(right); /* borrowed */
11146 if (right_uni == NULL) {
11147 /* memory error or bad data */
11148 PyErr_Clear();
11149 return _PyUnicode_EqualToASCIIString(left, right->string);
11150 }
11151
11152 if (left == right_uni)
11153 return 1;
11154
11155 if (PyUnicode_CHECK_INTERNED(left))
11156 return 0;
11157
11158 assert(_PyUnicode_HASH(right_uni) != 1);
11159 hash = _PyUnicode_HASH(left);
11160 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11161 return 0;
11162
11163 return unicode_compare_eq(left, right_uni);
11164}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011165
/* Map a C truth value onto Py_True / Py_False (no reference is taken). */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011168
Alexander Belopolsky40018472011-02-26 01:02:56 +000011169PyObject *
11170PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011171{
11172 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011173 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011174
Victor Stinnere5567ad2012-10-23 02:48:49 +020011175 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11176 Py_RETURN_NOTIMPLEMENTED;
11177
11178 if (PyUnicode_READY(left) == -1 ||
11179 PyUnicode_READY(right) == -1)
11180 return NULL;
11181
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011182 if (left == right) {
11183 switch (op) {
11184 case Py_EQ:
11185 case Py_LE:
11186 case Py_GE:
11187 /* a string is equal to itself */
11188 v = Py_True;
11189 break;
11190 case Py_NE:
11191 case Py_LT:
11192 case Py_GT:
11193 v = Py_False;
11194 break;
11195 default:
11196 PyErr_BadArgument();
11197 return NULL;
11198 }
11199 }
11200 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011201 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011202 result ^= (op == Py_NE);
11203 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011204 }
11205 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011206 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011207
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011208 /* Convert the return value to a Boolean */
11209 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011210 case Py_LE:
11211 v = TEST_COND(result <= 0);
11212 break;
11213 case Py_GE:
11214 v = TEST_COND(result >= 0);
11215 break;
11216 case Py_LT:
11217 v = TEST_COND(result == -1);
11218 break;
11219 case Py_GT:
11220 v = TEST_COND(result == 1);
11221 break;
11222 default:
11223 PyErr_BadArgument();
11224 return NULL;
11225 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011226 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011227 Py_INCREF(v);
11228 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011229}
11230
Alexander Belopolsky40018472011-02-26 01:02:56 +000011231int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011232_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11233{
11234 return unicode_eq(aa, bb);
11235}
11236
11237int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011238PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011239{
Victor Stinner77282cb2013-04-14 19:22:47 +020011240 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 void *buf1, *buf2;
11242 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011243 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011244
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011245 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011247 "'in <string>' requires string as left operand, not %.100s",
11248 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011249 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011250 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011251 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011252 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011253 if (ensure_unicode(str) < 0)
11254 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011257 kind2 = PyUnicode_KIND(substr);
11258 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011259 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011261 len2 = PyUnicode_GET_LENGTH(substr);
11262 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011263 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011264 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011265 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011266 if (len2 == 1) {
11267 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11268 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011269 return result;
11270 }
11271 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011272 buf2 = _PyUnicode_AsKind(substr, kind1);
11273 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011274 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276
Victor Stinner77282cb2013-04-14 19:22:47 +020011277 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 case PyUnicode_1BYTE_KIND:
11279 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11280 break;
11281 case PyUnicode_2BYTE_KIND:
11282 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11283 break;
11284 case PyUnicode_4BYTE_KIND:
11285 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11286 break;
11287 default:
11288 result = -1;
11289 assert(0);
11290 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011291
Victor Stinner77282cb2013-04-14 19:22:47 +020011292 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293 PyMem_Free(buf2);
11294
Guido van Rossum403d68b2000-03-13 15:55:09 +000011295 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011296}
11297
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298/* Concat to string or Unicode object giving a new Unicode object. */
11299
Alexander Belopolsky40018472011-02-26 01:02:56 +000011300PyObject *
11301PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011304 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011305 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011307 if (ensure_unicode(left) < 0)
11308 return NULL;
11309
11310 if (!PyUnicode_Check(right)) {
11311 PyErr_Format(PyExc_TypeError,
11312 "can only concatenate str (not \"%.200s\") to str",
11313 right->ob_type->tp_name);
11314 return NULL;
11315 }
11316 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
11319 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011320 if (left == unicode_empty)
11321 return PyUnicode_FromObject(right);
11322 if (right == unicode_empty)
11323 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011325 left_len = PyUnicode_GET_LENGTH(left);
11326 right_len = PyUnicode_GET_LENGTH(right);
11327 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011328 PyErr_SetString(PyExc_OverflowError,
11329 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011330 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011331 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011332 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011333
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011334 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11335 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011336 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011339 result = PyUnicode_New(new_len, maxchar);
11340 if (result == NULL)
11341 return NULL;
11342 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11343 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11344 assert(_PyUnicode_CheckConsistency(result, 1));
11345 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346}
11347
Walter Dörwald1ab83302007-05-18 17:15:44 +000011348void
Victor Stinner23e56682011-10-03 03:54:37 +020011349PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011350{
Victor Stinner23e56682011-10-03 03:54:37 +020011351 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011352 Py_UCS4 maxchar, maxchar2;
11353 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011354
11355 if (p_left == NULL) {
11356 if (!PyErr_Occurred())
11357 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011358 return;
11359 }
Victor Stinner23e56682011-10-03 03:54:37 +020011360 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011361 if (right == NULL || left == NULL
11362 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011363 if (!PyErr_Occurred())
11364 PyErr_BadInternalCall();
11365 goto error;
11366 }
11367
Benjamin Petersonbac79492012-01-14 13:34:47 -050011368 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011369 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011370 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011371 goto error;
11372
Victor Stinner488fa492011-12-12 00:01:39 +010011373 /* Shortcuts */
11374 if (left == unicode_empty) {
11375 Py_DECREF(left);
11376 Py_INCREF(right);
11377 *p_left = right;
11378 return;
11379 }
11380 if (right == unicode_empty)
11381 return;
11382
11383 left_len = PyUnicode_GET_LENGTH(left);
11384 right_len = PyUnicode_GET_LENGTH(right);
11385 if (left_len > PY_SSIZE_T_MAX - right_len) {
11386 PyErr_SetString(PyExc_OverflowError,
11387 "strings are too large to concat");
11388 goto error;
11389 }
11390 new_len = left_len + right_len;
11391
11392 if (unicode_modifiable(left)
11393 && PyUnicode_CheckExact(right)
11394 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011395 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11396 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011397 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011398 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011399 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11400 {
11401 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011402 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011403 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011404
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011405 /* copy 'right' into the newly allocated area of 'left' */
11406 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011407 }
Victor Stinner488fa492011-12-12 00:01:39 +010011408 else {
11409 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11410 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011411 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011412
Victor Stinner488fa492011-12-12 00:01:39 +010011413 /* Concat the two Unicode strings */
11414 res = PyUnicode_New(new_len, maxchar);
11415 if (res == NULL)
11416 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011417 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11418 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011419 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011420 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011421 }
11422 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011423 return;
11424
11425error:
Victor Stinner488fa492011-12-12 00:01:39 +010011426 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011427}
11428
11429void
11430PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11431{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011432 PyUnicode_Append(pleft, right);
11433 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011434}
11435
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011436/*
11437Wraps stringlib_parse_args_finds() and additionally ensures that the
11438first argument is a unicode object.
11439*/
11440
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011441static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011442parse_args_finds_unicode(const char * function_name, PyObject *args,
11443 PyObject **substring,
11444 Py_ssize_t *start, Py_ssize_t *end)
11445{
11446 if(stringlib_parse_args_finds(function_name, args, substring,
11447 start, end)) {
11448 if (ensure_unicode(*substring) < 0)
11449 return 0;
11450 return 1;
11451 }
11452 return 0;
11453}
11454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011455PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011458Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011459string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011463unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011465 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011466 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011467 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011469 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 void *buf1, *buf2;
11471 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011473 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 kind1 = PyUnicode_KIND(self);
11477 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011478 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011479 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 len1 = PyUnicode_GET_LENGTH(self);
11482 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011484 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011485 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011486
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011487 buf1 = PyUnicode_DATA(self);
11488 buf2 = PyUnicode_DATA(substring);
11489 if (kind2 != kind1) {
11490 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011491 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011492 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011493 }
11494 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 case PyUnicode_1BYTE_KIND:
11496 iresult = ucs1lib_count(
11497 ((Py_UCS1*)buf1) + start, end - start,
11498 buf2, len2, PY_SSIZE_T_MAX
11499 );
11500 break;
11501 case PyUnicode_2BYTE_KIND:
11502 iresult = ucs2lib_count(
11503 ((Py_UCS2*)buf1) + start, end - start,
11504 buf2, len2, PY_SSIZE_T_MAX
11505 );
11506 break;
11507 case PyUnicode_4BYTE_KIND:
11508 iresult = ucs4lib_count(
11509 ((Py_UCS4*)buf1) + start, end - start,
11510 buf2, len2, PY_SSIZE_T_MAX
11511 );
11512 break;
11513 default:
11514 assert(0); iresult = 0;
11515 }
11516
11517 result = PyLong_FromSsize_t(iresult);
11518
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011519 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522 return result;
11523}
11524
INADA Naoki3ae20562017-01-16 20:41:20 +090011525/*[clinic input]
11526str.encode as unicode_encode
11527
11528 encoding: str(c_default="NULL") = 'utf-8'
11529 The encoding in which to encode the string.
11530 errors: str(c_default="NULL") = 'strict'
11531 The error handling scheme to use for encoding errors.
11532 The default is 'strict' meaning that encoding errors raise a
11533 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11534 'xmlcharrefreplace' as well as any other name registered with
11535 codecs.register_error that can handle UnicodeEncodeErrors.
11536
11537Encode the string using the codec registered for encoding.
11538[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
11540static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011541unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011542/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011544 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011545}
11546
INADA Naoki3ae20562017-01-16 20:41:20 +090011547/*[clinic input]
11548str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549
INADA Naoki3ae20562017-01-16 20:41:20 +090011550 tabsize: int = 8
11551
11552Return a copy where all tab characters are expanded using spaces.
11553
11554If tabsize is not given, a tab size of 8 characters is assumed.
11555[clinic start generated code]*/
11556
11557static PyObject *
11558unicode_expandtabs_impl(PyObject *self, int tabsize)
11559/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 Py_ssize_t i, j, line_pos, src_len, incr;
11562 Py_UCS4 ch;
11563 PyObject *u;
11564 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011565 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011566 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
Antoine Pitrou22425222011-10-04 19:10:51 +020011568 if (PyUnicode_READY(self) == -1)
11569 return NULL;
11570
Thomas Wouters7e474022000-07-16 12:04:32 +000011571 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011572 src_len = PyUnicode_GET_LENGTH(self);
11573 i = j = line_pos = 0;
11574 kind = PyUnicode_KIND(self);
11575 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011576 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011577 for (; i < src_len; i++) {
11578 ch = PyUnicode_READ(kind, src_data, i);
11579 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011580 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011582 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011584 goto overflow;
11585 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011587 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011591 goto overflow;
11592 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011594 if (ch == '\n' || ch == '\r')
11595 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011598 if (!found)
11599 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011600
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011602 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603 if (!u)
11604 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011605 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606
Antoine Pitroue71d5742011-10-04 15:55:09 +020011607 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608
Antoine Pitroue71d5742011-10-04 15:55:09 +020011609 for (; i < src_len; i++) {
11610 ch = PyUnicode_READ(kind, src_data, i);
11611 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011613 incr = tabsize - (line_pos % tabsize);
11614 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011615 FILL(kind, dest_data, ' ', j, incr);
11616 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011618 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011620 line_pos++;
11621 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011622 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011623 if (ch == '\n' || ch == '\r')
11624 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011626 }
11627 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011628 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011629
Antoine Pitroue71d5742011-10-04 15:55:09 +020011630 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011631 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633}
11634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011635PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637\n\
11638Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011639such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640arguments start and end are interpreted as in slice notation.\n\
11641\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011642Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
11644static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011647 /* initialize variables to prevent gcc warning */
11648 PyObject *substring = NULL;
11649 Py_ssize_t start = 0;
11650 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011651 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011653 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011656 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011659 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (result == -2)
11662 return NULL;
11663
Christian Heimes217cfd12007-12-02 14:31:20 +000011664 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665}
11666
11667static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011668unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011670 void *data;
11671 enum PyUnicode_Kind kind;
11672 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011673
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011674 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011675 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011677 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011678 if (PyUnicode_READY(self) == -1) {
11679 return NULL;
11680 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011681 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11682 PyErr_SetString(PyExc_IndexError, "string index out of range");
11683 return NULL;
11684 }
11685 kind = PyUnicode_KIND(self);
11686 data = PyUnicode_DATA(self);
11687 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011688 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689}
11690
Guido van Rossumc2504932007-09-18 19:42:40 +000011691/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011692 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011693static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011694unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695{
Guido van Rossumc2504932007-09-18 19:42:40 +000011696 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011697 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011698
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011699#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011700 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011701#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 if (_PyUnicode_HASH(self) != -1)
11703 return _PyUnicode_HASH(self);
11704 if (PyUnicode_READY(self) == -1)
11705 return -1;
11706 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011707 /*
11708 We make the hash of the empty string be 0, rather than using
11709 (prefix ^ suffix), since this slightly obfuscates the hash secret
11710 */
11711 if (len == 0) {
11712 _PyUnicode_HASH(self) = 0;
11713 return 0;
11714 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011715 x = _Py_HashBytes(PyUnicode_DATA(self),
11716 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011718 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719}
11720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011721PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011724Return the lowest index in S where substring sub is found, \n\
11725such that sub is contained within S[start:end]. Optional\n\
11726arguments start and end are interpreted as in slice notation.\n\
11727\n\
11728Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
11730static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011733 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011734 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011735 PyObject *substring = NULL;
11736 Py_ssize_t start = 0;
11737 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011739 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011742 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011745 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (result == -2)
11748 return NULL;
11749
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 if (result < 0) {
11751 PyErr_SetString(PyExc_ValueError, "substring not found");
11752 return NULL;
11753 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011754
Christian Heimes217cfd12007-12-02 14:31:20 +000011755 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756}
11757
INADA Naoki3ae20562017-01-16 20:41:20 +090011758/*[clinic input]
11759str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760
INADA Naoki3ae20562017-01-16 20:41:20 +090011761Return True if the string is a lowercase string, False otherwise.
11762
11763A string is lowercase if all cased characters in the string are lowercase and
11764there is at least one cased character in the string.
11765[clinic start generated code]*/
11766
11767static PyObject *
11768unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011769/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 Py_ssize_t i, length;
11772 int kind;
11773 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774 int cased;
11775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (PyUnicode_READY(self) == -1)
11777 return NULL;
11778 length = PyUnicode_GET_LENGTH(self);
11779 kind = PyUnicode_KIND(self);
11780 data = PyUnicode_DATA(self);
11781
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 if (length == 1)
11784 return PyBool_FromLong(
11785 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011787 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011789 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011790
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 for (i = 0; i < length; i++) {
11793 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011794
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011796 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011797 else if (!cased && Py_UNICODE_ISLOWER(ch))
11798 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011800 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801}
11802
INADA Naoki3ae20562017-01-16 20:41:20 +090011803/*[clinic input]
11804str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
INADA Naoki3ae20562017-01-16 20:41:20 +090011806Return True if the string is an uppercase string, False otherwise.
11807
11808A string is uppercase if all cased characters in the string are uppercase and
11809there is at least one cased character in the string.
11810[clinic start generated code]*/
11811
11812static PyObject *
11813unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011814/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 Py_ssize_t i, length;
11817 int kind;
11818 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 int cased;
11820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (PyUnicode_READY(self) == -1)
11822 return NULL;
11823 length = PyUnicode_GET_LENGTH(self);
11824 kind = PyUnicode_KIND(self);
11825 data = PyUnicode_DATA(self);
11826
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 if (length == 1)
11829 return PyBool_FromLong(
11830 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011832 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011834 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011835
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 for (i = 0; i < length; i++) {
11838 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011839
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011841 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 else if (!cased && Py_UNICODE_ISUPPER(ch))
11843 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011845 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846}
11847
INADA Naoki3ae20562017-01-16 20:41:20 +090011848/*[clinic input]
11849str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850
INADA Naoki3ae20562017-01-16 20:41:20 +090011851Return True if the string is a title-cased string, False otherwise.
11852
11853In a title-cased string, upper- and title-case characters may only
11854follow uncased characters and lowercase characters only cased ones.
11855[clinic start generated code]*/
11856
11857static PyObject *
11858unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011859/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 Py_ssize_t i, length;
11862 int kind;
11863 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864 int cased, previous_is_cased;
11865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 if (PyUnicode_READY(self) == -1)
11867 return NULL;
11868 length = PyUnicode_GET_LENGTH(self);
11869 kind = PyUnicode_KIND(self);
11870 data = PyUnicode_DATA(self);
11871
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if (length == 1) {
11874 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11875 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11876 (Py_UNICODE_ISUPPER(ch) != 0));
11877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011879 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011881 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011882
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883 cased = 0;
11884 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 for (i = 0; i < length; i++) {
11886 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011887
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11889 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011890 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 previous_is_cased = 1;
11892 cased = 1;
11893 }
11894 else if (Py_UNICODE_ISLOWER(ch)) {
11895 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011896 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 previous_is_cased = 1;
11898 cased = 1;
11899 }
11900 else
11901 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011903 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904}
11905
INADA Naoki3ae20562017-01-16 20:41:20 +090011906/*[clinic input]
11907str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908
INADA Naoki3ae20562017-01-16 20:41:20 +090011909Return True if the string is a whitespace string, False otherwise.
11910
11911A string is whitespace if all characters in the string are whitespace and there
11912is at least one character in the string.
11913[clinic start generated code]*/
11914
11915static PyObject *
11916unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011917/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 Py_ssize_t i, length;
11920 int kind;
11921 void *data;
11922
11923 if (PyUnicode_READY(self) == -1)
11924 return NULL;
11925 length = PyUnicode_GET_LENGTH(self);
11926 kind = PyUnicode_KIND(self);
11927 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 if (length == 1)
11931 return PyBool_FromLong(
11932 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011934 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011936 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 for (i = 0; i < length; i++) {
11939 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011940 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011941 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011943 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944}
11945
INADA Naoki3ae20562017-01-16 20:41:20 +090011946/*[clinic input]
11947str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011948
INADA Naoki3ae20562017-01-16 20:41:20 +090011949Return True if the string is an alphabetic string, False otherwise.
11950
11951A string is alphabetic if all characters in the string are alphabetic and there
11952is at least one character in the string.
11953[clinic start generated code]*/
11954
11955static PyObject *
11956unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011957/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 Py_ssize_t i, length;
11960 int kind;
11961 void *data;
11962
11963 if (PyUnicode_READY(self) == -1)
11964 return NULL;
11965 length = PyUnicode_GET_LENGTH(self);
11966 kind = PyUnicode_KIND(self);
11967 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011968
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011969 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (length == 1)
11971 return PyBool_FromLong(
11972 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011973
11974 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011976 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 for (i = 0; i < length; i++) {
11979 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011980 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011981 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011982 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011983}
11984
INADA Naoki3ae20562017-01-16 20:41:20 +090011985/*[clinic input]
11986str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011987
INADA Naoki3ae20562017-01-16 20:41:20 +090011988Return True if the string is an alpha-numeric string, False otherwise.
11989
11990A string is alpha-numeric if all characters in the string are alpha-numeric and
11991there is at least one character in the string.
11992[clinic start generated code]*/
11993
11994static PyObject *
11995unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011996/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 int kind;
11999 void *data;
12000 Py_ssize_t len, i;
12001
12002 if (PyUnicode_READY(self) == -1)
12003 return NULL;
12004
12005 kind = PyUnicode_KIND(self);
12006 data = PyUnicode_DATA(self);
12007 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012008
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012009 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 if (len == 1) {
12011 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12012 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12013 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012014
12015 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012017 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 for (i = 0; i < len; i++) {
12020 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012021 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012022 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012023 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012024 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012025}
12026
INADA Naoki3ae20562017-01-16 20:41:20 +090012027/*[clinic input]
12028str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029
INADA Naoki3ae20562017-01-16 20:41:20 +090012030Return True if the string is a decimal string, False otherwise.
12031
12032A string is a decimal string if all characters in the string are decimal and
12033there is at least one character in the string.
12034[clinic start generated code]*/
12035
12036static PyObject *
12037unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012038/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 Py_ssize_t i, length;
12041 int kind;
12042 void *data;
12043
12044 if (PyUnicode_READY(self) == -1)
12045 return NULL;
12046 length = PyUnicode_GET_LENGTH(self);
12047 kind = PyUnicode_KIND(self);
12048 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 if (length == 1)
12052 return PyBool_FromLong(
12053 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012055 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012057 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 for (i = 0; i < length; i++) {
12060 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012061 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012063 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064}
12065
INADA Naoki3ae20562017-01-16 20:41:20 +090012066/*[clinic input]
12067str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
INADA Naoki3ae20562017-01-16 20:41:20 +090012069Return True if the string is a digit string, False otherwise.
12070
12071A string is a digit string if all characters in the string are digits and there
12072is at least one character in the string.
12073[clinic start generated code]*/
12074
12075static PyObject *
12076unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012077/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 Py_ssize_t i, length;
12080 int kind;
12081 void *data;
12082
12083 if (PyUnicode_READY(self) == -1)
12084 return NULL;
12085 length = PyUnicode_GET_LENGTH(self);
12086 kind = PyUnicode_KIND(self);
12087 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 if (length == 1) {
12091 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12092 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012095 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012097 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 for (i = 0; i < length; i++) {
12100 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012101 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012103 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104}
12105
INADA Naoki3ae20562017-01-16 20:41:20 +090012106/*[clinic input]
12107str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
INADA Naoki3ae20562017-01-16 20:41:20 +090012109Return True if the string is a numeric string, False otherwise.
12110
12111A string is numeric if all characters in the string are numeric and there is at
12112least one character in the string.
12113[clinic start generated code]*/
12114
12115static PyObject *
12116unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012117/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 Py_ssize_t i, length;
12120 int kind;
12121 void *data;
12122
12123 if (PyUnicode_READY(self) == -1)
12124 return NULL;
12125 length = PyUnicode_GET_LENGTH(self);
12126 kind = PyUnicode_KIND(self);
12127 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 if (length == 1)
12131 return PyBool_FromLong(
12132 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012134 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012136 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 for (i = 0; i < length; i++) {
12139 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012140 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012142 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143}
12144
Martin v. Löwis47383402007-08-15 07:32:56 +000012145int
12146PyUnicode_IsIdentifier(PyObject *self)
12147{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 int kind;
12149 void *data;
12150 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012151 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 if (PyUnicode_READY(self) == -1) {
12154 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012155 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 }
12157
12158 /* Special case for empty strings */
12159 if (PyUnicode_GET_LENGTH(self) == 0)
12160 return 0;
12161 kind = PyUnicode_KIND(self);
12162 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012163
12164 /* PEP 3131 says that the first character must be in
12165 XID_Start and subsequent characters in XID_Continue,
12166 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012167 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012168 letters, digits, underscore). However, given the current
12169 definition of XID_Start and XID_Continue, it is sufficient
12170 to check just for these, except that _ must be allowed
12171 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012173 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012174 return 0;
12175
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012176 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012178 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012179 return 1;
12180}
12181
INADA Naoki3ae20562017-01-16 20:41:20 +090012182/*[clinic input]
12183str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012184
INADA Naoki3ae20562017-01-16 20:41:20 +090012185Return True if the string is a valid Python identifier, False otherwise.
12186
12187Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12188"class".
12189[clinic start generated code]*/
12190
12191static PyObject *
12192unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012193/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012194{
12195 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12196}
12197
INADA Naoki3ae20562017-01-16 20:41:20 +090012198/*[clinic input]
12199str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012200
INADA Naoki3ae20562017-01-16 20:41:20 +090012201Return True if the string is printable, False otherwise.
12202
12203A string is printable if all of its characters are considered printable in
12204repr() or if it is empty.
12205[clinic start generated code]*/
12206
12207static PyObject *
12208unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012209/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 Py_ssize_t i, length;
12212 int kind;
12213 void *data;
12214
12215 if (PyUnicode_READY(self) == -1)
12216 return NULL;
12217 length = PyUnicode_GET_LENGTH(self);
12218 kind = PyUnicode_KIND(self);
12219 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012220
12221 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 if (length == 1)
12223 return PyBool_FromLong(
12224 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 for (i = 0; i < length; i++) {
12227 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012228 Py_RETURN_FALSE;
12229 }
12230 }
12231 Py_RETURN_TRUE;
12232}
12233
INADA Naoki3ae20562017-01-16 20:41:20 +090012234/*[clinic input]
12235str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236
INADA Naoki3ae20562017-01-16 20:41:20 +090012237 iterable: object
12238 /
12239
12240Concatenate any number of strings.
12241
Martin Panter91a88662017-01-24 00:30:06 +000012242The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012243The result is returned as a new string.
12244
12245Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12246[clinic start generated code]*/
12247
12248static PyObject *
12249unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012250/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251{
INADA Naoki3ae20562017-01-16 20:41:20 +090012252 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253}
12254
Martin v. Löwis18e16552006-02-15 17:27:45 +000012255static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012256unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 if (PyUnicode_READY(self) == -1)
12259 return -1;
12260 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261}
12262
INADA Naoki3ae20562017-01-16 20:41:20 +090012263/*[clinic input]
12264str.ljust as unicode_ljust
12265
12266 width: Py_ssize_t
12267 fillchar: Py_UCS4 = ' '
12268 /
12269
12270Return a left-justified string of length width.
12271
12272Padding is done using the specified fill character (default is a space).
12273[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274
12275static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012276unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12277/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012279 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281
Victor Stinnerc4b49542011-12-11 22:44:26 +010012282 if (PyUnicode_GET_LENGTH(self) >= width)
12283 return unicode_result_unchanged(self);
12284
12285 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286}
12287
INADA Naoki3ae20562017-01-16 20:41:20 +090012288/*[clinic input]
12289str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290
INADA Naoki3ae20562017-01-16 20:41:20 +090012291Return a copy of the string converted to lowercase.
12292[clinic start generated code]*/
12293
12294static PyObject *
12295unicode_lower_impl(PyObject *self)
12296/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012298 if (PyUnicode_READY(self) == -1)
12299 return NULL;
12300 if (PyUnicode_IS_ASCII(self))
12301 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012302 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303}
12304
/* Strip modes: which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  The table is read-only
   lookup data, so both the strings and the pointers are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the strip method for strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012313
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012314/* externally visible for str.strip(unicode) */
12315PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012316_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012317{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 void *data;
12319 int kind;
12320 Py_ssize_t i, j, len;
12321 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012322 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12325 return NULL;
12326
12327 kind = PyUnicode_KIND(self);
12328 data = PyUnicode_DATA(self);
12329 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012330 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12332 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012333 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012334
Benjamin Peterson14339b62009-01-31 16:36:08 +000012335 i = 0;
12336 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012337 while (i < len) {
12338 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12339 if (!BLOOM(sepmask, ch))
12340 break;
12341 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12342 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 i++;
12344 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012345 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012346
Benjamin Peterson14339b62009-01-31 16:36:08 +000012347 j = len;
12348 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012349 j--;
12350 while (j >= i) {
12351 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12352 if (!BLOOM(sepmask, ch))
12353 break;
12354 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12355 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012357 }
12358
Benjamin Peterson29060642009-01-31 22:14:21 +000012359 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012360 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012361
Victor Stinner7931d9a2011-11-04 00:22:48 +010012362 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363}
12364
12365PyObject*
12366PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12367{
12368 unsigned char *data;
12369 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012370 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371
Victor Stinnerde636f32011-10-01 03:55:54 +020012372 if (PyUnicode_READY(self) == -1)
12373 return NULL;
12374
Victor Stinner684d5fd2012-05-03 02:32:34 +020012375 length = PyUnicode_GET_LENGTH(self);
12376 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012377
Victor Stinner684d5fd2012-05-03 02:32:34 +020012378 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012379 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380
Victor Stinnerde636f32011-10-01 03:55:54 +020012381 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012382 PyErr_SetString(PyExc_IndexError, "string index out of range");
12383 return NULL;
12384 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012385 if (start >= length || end < start)
12386 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012387
Victor Stinner684d5fd2012-05-03 02:32:34 +020012388 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012389 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012390 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012391 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012392 }
12393 else {
12394 kind = PyUnicode_KIND(self);
12395 data = PyUnicode_1BYTE_DATA(self);
12396 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012397 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012398 length);
12399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401
12402static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012403do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 Py_ssize_t len, i, j;
12406
12407 if (PyUnicode_READY(self) == -1)
12408 return NULL;
12409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012411
Victor Stinnercc7af722013-04-09 22:39:24 +020012412 if (PyUnicode_IS_ASCII(self)) {
12413 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12414
12415 i = 0;
12416 if (striptype != RIGHTSTRIP) {
12417 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012418 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012419 if (!_Py_ascii_whitespace[ch])
12420 break;
12421 i++;
12422 }
12423 }
12424
12425 j = len;
12426 if (striptype != LEFTSTRIP) {
12427 j--;
12428 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012429 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012430 if (!_Py_ascii_whitespace[ch])
12431 break;
12432 j--;
12433 }
12434 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012435 }
12436 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012437 else {
12438 int kind = PyUnicode_KIND(self);
12439 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012440
Victor Stinnercc7af722013-04-09 22:39:24 +020012441 i = 0;
12442 if (striptype != RIGHTSTRIP) {
12443 while (i < len) {
12444 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12445 if (!Py_UNICODE_ISSPACE(ch))
12446 break;
12447 i++;
12448 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012449 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012450
12451 j = len;
12452 if (striptype != LEFTSTRIP) {
12453 j--;
12454 while (j >= i) {
12455 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12456 if (!Py_UNICODE_ISSPACE(ch))
12457 break;
12458 j--;
12459 }
12460 j++;
12461 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012463
Victor Stinner7931d9a2011-11-04 00:22:48 +010012464 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465}
12466
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012467
12468static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012469do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012470{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471 if (sep != NULL && sep != Py_None) {
12472 if (PyUnicode_Check(sep))
12473 return _PyUnicode_XStrip(self, striptype, sep);
12474 else {
12475 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012476 "%s arg must be None or str",
12477 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012478 return NULL;
12479 }
12480 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012481
Benjamin Peterson14339b62009-01-31 16:36:08 +000012482 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012483}
12484
12485
INADA Naoki3ae20562017-01-16 20:41:20 +090012486/*[clinic input]
12487str.strip as unicode_strip
12488
12489 chars: object = None
12490 /
12491
Victor Stinner0c4a8282017-01-17 02:21:47 +010012492Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012493
12494If chars is given and not None, remove characters in chars instead.
12495[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012496
12497static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012498unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012499/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012500{
INADA Naoki3ae20562017-01-16 20:41:20 +090012501 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012502}
12503
12504
INADA Naoki3ae20562017-01-16 20:41:20 +090012505/*[clinic input]
12506str.lstrip as unicode_lstrip
12507
12508 chars: object = NULL
12509 /
12510
12511Return a copy of the string with leading whitespace removed.
12512
12513If chars is given and not None, remove characters in chars instead.
12514[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012515
12516static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012517unicode_lstrip_impl(PyObject *self, PyObject *chars)
12518/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012519{
INADA Naoki3ae20562017-01-16 20:41:20 +090012520 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012521}
12522
12523
INADA Naoki3ae20562017-01-16 20:41:20 +090012524/*[clinic input]
12525str.rstrip as unicode_rstrip
12526
12527 chars: object = NULL
12528 /
12529
12530Return a copy of the string with trailing whitespace removed.
12531
12532If chars is given and not None, remove characters in chars instead.
12533[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012534
12535static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012536unicode_rstrip_impl(PyObject *self, PyObject *chars)
12537/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012538{
INADA Naoki3ae20562017-01-16 20:41:20 +090012539 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012540}
12541
12542
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012544unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012546 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548
Serhiy Storchaka05997252013-01-26 12:14:02 +020012549 if (len < 1)
12550 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551
Victor Stinnerc4b49542011-12-11 22:44:26 +010012552 /* no repeat, return original string */
12553 if (len == 1)
12554 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012555
Benjamin Petersonbac79492012-01-14 13:34:47 -050012556 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 return NULL;
12558
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012559 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012560 PyErr_SetString(PyExc_OverflowError,
12561 "repeated string is too long");
12562 return NULL;
12563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012565
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012566 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567 if (!u)
12568 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012569 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 if (PyUnicode_GET_LENGTH(str) == 1) {
12572 const int kind = PyUnicode_KIND(str);
12573 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012574 if (kind == PyUnicode_1BYTE_KIND) {
12575 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012576 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012577 }
12578 else if (kind == PyUnicode_2BYTE_KIND) {
12579 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012580 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012581 ucs2[n] = fill_char;
12582 } else {
12583 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12584 assert(kind == PyUnicode_4BYTE_KIND);
12585 for (n = 0; n < len; ++n)
12586 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 }
12589 else {
12590 /* number of characters copied this far */
12591 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012592 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012594 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012596 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012598 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012599 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601 }
12602
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012603 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012604 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605}
12606
Alexander Belopolsky40018472011-02-26 01:02:56 +000012607PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012608PyUnicode_Replace(PyObject *str,
12609 PyObject *substr,
12610 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012611 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012613 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12614 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012616 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617}
12618
INADA Naoki3ae20562017-01-16 20:41:20 +090012619/*[clinic input]
12620str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621
INADA Naoki3ae20562017-01-16 20:41:20 +090012622 old: unicode
12623 new: unicode
12624 count: Py_ssize_t = -1
12625 Maximum number of occurrences to replace.
12626 -1 (the default value) means replace all occurrences.
12627 /
12628
12629Return a copy with all occurrences of substring old replaced by new.
12630
12631If the optional argument count is given, only the first count occurrences are
12632replaced.
12633[clinic start generated code]*/
12634
12635static PyObject *
12636unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12637 Py_ssize_t count)
12638/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012640 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012642 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643}
12644
Alexander Belopolsky40018472011-02-26 01:02:56 +000012645static PyObject *
12646unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012648 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 Py_ssize_t isize;
12650 Py_ssize_t osize, squote, dquote, i, o;
12651 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012652 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012656 return NULL;
12657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 isize = PyUnicode_GET_LENGTH(unicode);
12659 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 /* Compute length of output, quote characters, and
12662 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012663 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 max = 127;
12665 squote = dquote = 0;
12666 ikind = PyUnicode_KIND(unicode);
12667 for (i = 0; i < isize; i++) {
12668 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012669 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012671 case '\'': squote++; break;
12672 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012674 incr = 2;
12675 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 default:
12677 /* Fast-path ASCII */
12678 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012679 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012681 ;
12682 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012685 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012687 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012689 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012691 if (osize > PY_SSIZE_T_MAX - incr) {
12692 PyErr_SetString(PyExc_OverflowError,
12693 "string is too long to generate repr");
12694 return NULL;
12695 }
12696 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 }
12698
12699 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012700 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012702 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 if (dquote)
12704 /* Both squote and dquote present. Use squote,
12705 and escape them */
12706 osize += squote;
12707 else
12708 quote = '"';
12709 }
Victor Stinner55c08782013-04-14 18:45:39 +020012710 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711
12712 repr = PyUnicode_New(osize, max);
12713 if (repr == NULL)
12714 return NULL;
12715 okind = PyUnicode_KIND(repr);
12716 odata = PyUnicode_DATA(repr);
12717
12718 PyUnicode_WRITE(okind, odata, 0, quote);
12719 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012720 if (unchanged) {
12721 _PyUnicode_FastCopyCharacters(repr, 1,
12722 unicode, 0,
12723 isize);
12724 }
12725 else {
12726 for (i = 0, o = 1; i < isize; i++) {
12727 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728
Victor Stinner55c08782013-04-14 18:45:39 +020012729 /* Escape quotes and backslashes */
12730 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012731 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012733 continue;
12734 }
12735
12736 /* Map special whitespace to '\t', \n', '\r' */
12737 if (ch == '\t') {
12738 PyUnicode_WRITE(okind, odata, o++, '\\');
12739 PyUnicode_WRITE(okind, odata, o++, 't');
12740 }
12741 else if (ch == '\n') {
12742 PyUnicode_WRITE(okind, odata, o++, '\\');
12743 PyUnicode_WRITE(okind, odata, o++, 'n');
12744 }
12745 else if (ch == '\r') {
12746 PyUnicode_WRITE(okind, odata, o++, '\\');
12747 PyUnicode_WRITE(okind, odata, o++, 'r');
12748 }
12749
12750 /* Map non-printable US ASCII to '\xhh' */
12751 else if (ch < ' ' || ch == 0x7F) {
12752 PyUnicode_WRITE(okind, odata, o++, '\\');
12753 PyUnicode_WRITE(okind, odata, o++, 'x');
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12755 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12756 }
12757
12758 /* Copy ASCII characters as-is */
12759 else if (ch < 0x7F) {
12760 PyUnicode_WRITE(okind, odata, o++, ch);
12761 }
12762
12763 /* Non-ASCII characters */
12764 else {
12765 /* Map Unicode whitespace and control characters
12766 (categories Z* and C* except ASCII space)
12767 */
12768 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12769 PyUnicode_WRITE(okind, odata, o++, '\\');
12770 /* Map 8-bit characters to '\xhh' */
12771 if (ch <= 0xff) {
12772 PyUnicode_WRITE(okind, odata, o++, 'x');
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12775 }
12776 /* Map 16-bit characters to '\uxxxx' */
12777 else if (ch <= 0xffff) {
12778 PyUnicode_WRITE(okind, odata, o++, 'u');
12779 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12782 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12783 }
12784 /* Map 21-bit characters to '\U00xxxxxx' */
12785 else {
12786 PyUnicode_WRITE(okind, odata, o++, 'U');
12787 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12788 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12789 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12790 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12791 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12792 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12793 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12794 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12795 }
12796 }
12797 /* Copy characters as-is */
12798 else {
12799 PyUnicode_WRITE(okind, odata, o++, ch);
12800 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012801 }
12802 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012805 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012806 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807}
12808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012809PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811\n\
12812Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012813such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814arguments start and end are interpreted as in slice notation.\n\
12815\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012816Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
12818static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012821 /* initialize variables to prevent gcc warning */
12822 PyObject *substring = NULL;
12823 Py_ssize_t start = 0;
12824 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012825 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012827 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012830 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012833 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 if (result == -2)
12836 return NULL;
12837
Christian Heimes217cfd12007-12-02 14:31:20 +000012838 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839}
12840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012841PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012844Return the highest index in S where substring sub is found,\n\
12845such that sub is contained within S[start:end]. Optional\n\
12846arguments start and end are interpreted as in slice notation.\n\
12847\n\
12848Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849
12850static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012853 /* initialize variables to prevent gcc warning */
12854 PyObject *substring = NULL;
12855 Py_ssize_t start = 0;
12856 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012857 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012859 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012860 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012862 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012864
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012865 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 if (result == -2)
12868 return NULL;
12869
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870 if (result < 0) {
12871 PyErr_SetString(PyExc_ValueError, "substring not found");
12872 return NULL;
12873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874
Christian Heimes217cfd12007-12-02 14:31:20 +000012875 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876}
12877
INADA Naoki3ae20562017-01-16 20:41:20 +090012878/*[clinic input]
12879str.rjust as unicode_rjust
12880
12881 width: Py_ssize_t
12882 fillchar: Py_UCS4 = ' '
12883 /
12884
12885Return a right-justified string of length width.
12886
12887Padding is done using the specified fill character (default is a space).
12888[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889
12890static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012891unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12892/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012894 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895 return NULL;
12896
Victor Stinnerc4b49542011-12-11 22:44:26 +010012897 if (PyUnicode_GET_LENGTH(self) >= width)
12898 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899
Victor Stinnerc4b49542011-12-11 22:44:26 +010012900 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901}
12902
Alexander Belopolsky40018472011-02-26 01:02:56 +000012903PyObject *
12904PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012906 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012909 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910}
12911
INADA Naoki3ae20562017-01-16 20:41:20 +090012912/*[clinic input]
12913str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012914
INADA Naoki3ae20562017-01-16 20:41:20 +090012915 sep: object = None
12916 The delimiter according to which to split the string.
12917 None (the default value) means split according to any whitespace,
12918 and discard empty strings from the result.
12919 maxsplit: Py_ssize_t = -1
12920 Maximum number of splits to do.
12921 -1 (the default value) means no limit.
12922
12923Return a list of the words in the string, using sep as the delimiter string.
12924[clinic start generated code]*/
12925
12926static PyObject *
12927unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12928/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929{
INADA Naoki3ae20562017-01-16 20:41:20 +090012930 if (sep == Py_None)
12931 return split(self, NULL, maxsplit);
12932 if (PyUnicode_Check(sep))
12933 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012934
12935 PyErr_Format(PyExc_TypeError,
12936 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012937 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939}
12940
Thomas Wouters477c8d52006-05-27 19:21:47 +000012941PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012942PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012943{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012944 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012945 int kind1, kind2;
12946 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012948
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012949 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012951
Victor Stinner14f8f022011-10-05 20:58:25 +020012952 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 len1 = PyUnicode_GET_LENGTH(str_obj);
12955 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012956 if (kind1 < kind2 || len1 < len2) {
12957 _Py_INCREF_UNICODE_EMPTY();
12958 if (!unicode_empty)
12959 out = NULL;
12960 else {
12961 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12962 Py_DECREF(unicode_empty);
12963 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012964 return out;
12965 }
12966 buf1 = PyUnicode_DATA(str_obj);
12967 buf2 = PyUnicode_DATA(sep_obj);
12968 if (kind2 != kind1) {
12969 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12970 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012971 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012974 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012976 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12977 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12978 else
12979 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 break;
12981 case PyUnicode_2BYTE_KIND:
12982 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12983 break;
12984 case PyUnicode_4BYTE_KIND:
12985 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12986 break;
12987 default:
12988 assert(0);
12989 out = 0;
12990 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012991
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012992 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994
12995 return out;
12996}
12997
12998
12999PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013000PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013001{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013002 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013003 int kind1, kind2;
13004 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013006
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013007 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013008 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013009
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013010 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 len1 = PyUnicode_GET_LENGTH(str_obj);
13013 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013014 if (kind1 < kind2 || len1 < len2) {
13015 _Py_INCREF_UNICODE_EMPTY();
13016 if (!unicode_empty)
13017 out = NULL;
13018 else {
13019 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13020 Py_DECREF(unicode_empty);
13021 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013022 return out;
13023 }
13024 buf1 = PyUnicode_DATA(str_obj);
13025 buf2 = PyUnicode_DATA(sep_obj);
13026 if (kind2 != kind1) {
13027 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13028 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013029 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013031
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013032 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013034 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13035 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13036 else
13037 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 break;
13039 case PyUnicode_2BYTE_KIND:
13040 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13041 break;
13042 case PyUnicode_4BYTE_KIND:
13043 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13044 break;
13045 default:
13046 assert(0);
13047 out = 0;
13048 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013049
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013050 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013051 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052
13053 return out;
13054}
13055
INADA Naoki3ae20562017-01-16 20:41:20 +090013056/*[clinic input]
13057str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013058
INADA Naoki3ae20562017-01-16 20:41:20 +090013059 sep: object
13060 /
13061
13062Partition the string into three parts using the given separator.
13063
13064This will search for the separator in the string. If the separator is found,
13065returns a 3-tuple containing the part before the separator, the separator
13066itself, and the part after it.
13067
13068If the separator is not found, returns a 3-tuple containing the original string
13069and two empty strings.
13070[clinic start generated code]*/
13071
13072static PyObject *
13073unicode_partition(PyObject *self, PyObject *sep)
13074/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013075{
INADA Naoki3ae20562017-01-16 20:41:20 +090013076 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013077}
13078
INADA Naoki3ae20562017-01-16 20:41:20 +090013079/*[clinic input]
13080str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013081
INADA Naoki3ae20562017-01-16 20:41:20 +090013082Partition the string into three parts using the given separator.
13083
This will search for the separator in the string, starting at the end. If
13085the separator is found, returns a 3-tuple containing the part before the
13086separator, the separator itself, and the part after it.
13087
13088If the separator is not found, returns a 3-tuple containing two empty strings
13089and the original string.
13090[clinic start generated code]*/
13091
13092static PyObject *
13093unicode_rpartition(PyObject *self, PyObject *sep)
13094/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013095{
INADA Naoki3ae20562017-01-16 20:41:20 +090013096 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013097}
13098
Alexander Belopolsky40018472011-02-26 01:02:56 +000013099PyObject *
13100PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013101{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013102 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013103 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013104
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013105 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013106}
13107
INADA Naoki3ae20562017-01-16 20:41:20 +090013108/*[clinic input]
13109str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013110
INADA Naoki3ae20562017-01-16 20:41:20 +090013111Return a list of the words in the string, using sep as the delimiter string.
13112
13113Splits are done starting at the end of the string and working to the front.
13114[clinic start generated code]*/
13115
13116static PyObject *
13117unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13118/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013119{
INADA Naoki3ae20562017-01-16 20:41:20 +090013120 if (sep == Py_None)
13121 return rsplit(self, NULL, maxsplit);
13122 if (PyUnicode_Check(sep))
13123 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013124
13125 PyErr_Format(PyExc_TypeError,
13126 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013127 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013128 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013129}
13130
INADA Naoki3ae20562017-01-16 20:41:20 +090013131/*[clinic input]
13132str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013134 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013135
13136Return a list of the lines in the string, breaking at line boundaries.
13137
13138Line breaks are not included in the resulting list unless keepends is given and
13139true.
13140[clinic start generated code]*/
13141
13142static PyObject *
13143unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013144/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013146 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147}
13148
13149static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013150PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013152 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153}
13154
INADA Naoki3ae20562017-01-16 20:41:20 +090013155/*[clinic input]
13156str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157
INADA Naoki3ae20562017-01-16 20:41:20 +090013158Convert uppercase characters to lowercase and lowercase characters to uppercase.
13159[clinic start generated code]*/
13160
13161static PyObject *
13162unicode_swapcase_impl(PyObject *self)
13163/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013165 if (PyUnicode_READY(self) == -1)
13166 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013167 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168}
13169
Larry Hastings61272b72014-01-07 12:41:53 -080013170/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013171
Larry Hastings31826802013-10-19 00:09:25 -070013172@staticmethod
13173str.maketrans as unicode_maketrans
13174
13175 x: object
13176
13177 y: unicode=NULL
13178
13179 z: unicode=NULL
13180
13181 /
13182
13183Return a translation table usable for str.translate().
13184
13185If there is only one argument, it must be a dictionary mapping Unicode
13186ordinals (integers) or characters to Unicode ordinals, strings or None.
13187Character keys will be then converted to ordinals.
13188If there are two arguments, they must be strings of equal length, and
13189in the resulting dictionary, each character in x will be mapped to the
13190character at the same position in y. If there is a third argument, it
13191must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013192[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013193
Larry Hastings31826802013-10-19 00:09:25 -070013194static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013195unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013196/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013197{
Georg Brandlceee0772007-11-27 23:48:05 +000013198 PyObject *new = NULL, *key, *value;
13199 Py_ssize_t i = 0;
13200 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013201
Georg Brandlceee0772007-11-27 23:48:05 +000013202 new = PyDict_New();
13203 if (!new)
13204 return NULL;
13205 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 int x_kind, y_kind, z_kind;
13207 void *x_data, *y_data, *z_data;
13208
Georg Brandlceee0772007-11-27 23:48:05 +000013209 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013210 if (!PyUnicode_Check(x)) {
13211 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13212 "be a string if there is a second argument");
13213 goto err;
13214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013216 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13217 "arguments must have equal length");
13218 goto err;
13219 }
13220 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013221 x_kind = PyUnicode_KIND(x);
13222 y_kind = PyUnicode_KIND(y);
13223 x_data = PyUnicode_DATA(x);
13224 y_data = PyUnicode_DATA(y);
13225 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13226 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013227 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013228 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013229 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013230 if (!value) {
13231 Py_DECREF(key);
13232 goto err;
13233 }
Georg Brandlceee0772007-11-27 23:48:05 +000013234 res = PyDict_SetItem(new, key, value);
13235 Py_DECREF(key);
13236 Py_DECREF(value);
13237 if (res < 0)
13238 goto err;
13239 }
13240 /* create entries for deleting chars in z */
13241 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013242 z_kind = PyUnicode_KIND(z);
13243 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013244 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013246 if (!key)
13247 goto err;
13248 res = PyDict_SetItem(new, key, Py_None);
13249 Py_DECREF(key);
13250 if (res < 0)
13251 goto err;
13252 }
13253 }
13254 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 int kind;
13256 void *data;
13257
Georg Brandlceee0772007-11-27 23:48:05 +000013258 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013259 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013260 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13261 "to maketrans it must be a dict");
13262 goto err;
13263 }
13264 /* copy entries into the new dict, converting string keys to int keys */
13265 while (PyDict_Next(x, &i, &key, &value)) {
13266 if (PyUnicode_Check(key)) {
13267 /* convert string keys to integer keys */
13268 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013269 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013270 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13271 "table must be of length 1");
13272 goto err;
13273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274 kind = PyUnicode_KIND(key);
13275 data = PyUnicode_DATA(key);
13276 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013277 if (!newkey)
13278 goto err;
13279 res = PyDict_SetItem(new, newkey, value);
13280 Py_DECREF(newkey);
13281 if (res < 0)
13282 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013283 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013284 /* just keep integer keys */
13285 if (PyDict_SetItem(new, key, value) < 0)
13286 goto err;
13287 } else {
13288 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13289 "be strings or integers");
13290 goto err;
13291 }
13292 }
13293 }
13294 return new;
13295 err:
13296 Py_DECREF(new);
13297 return NULL;
13298}
13299
INADA Naoki3ae20562017-01-16 20:41:20 +090013300/*[clinic input]
13301str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302
INADA Naoki3ae20562017-01-16 20:41:20 +090013303 table: object
13304 Translation table, which must be a mapping of Unicode ordinals to
13305 Unicode ordinals, strings, or None.
13306 /
13307
13308Replace each character in the string using the given translation table.
13309
13310The table must implement lookup/indexing via __getitem__, for instance a
13311dictionary or list. If this operation raises LookupError, the character is
13312left untouched. Characters mapped to None are deleted.
13313[clinic start generated code]*/
13314
13315static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013317/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013319 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013320}
13321
INADA Naoki3ae20562017-01-16 20:41:20 +090013322/*[clinic input]
13323str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324
INADA Naoki3ae20562017-01-16 20:41:20 +090013325Return a copy of the string converted to uppercase.
13326[clinic start generated code]*/
13327
13328static PyObject *
13329unicode_upper_impl(PyObject *self)
13330/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013332 if (PyUnicode_READY(self) == -1)
13333 return NULL;
13334 if (PyUnicode_IS_ASCII(self))
13335 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013336 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013337}
13338
INADA Naoki3ae20562017-01-16 20:41:20 +090013339/*[clinic input]
13340str.zfill as unicode_zfill
13341
13342 width: Py_ssize_t
13343 /
13344
13345Pad a numeric string with zeros on the left, to fill a field of the given width.
13346
13347The string is never truncated.
13348[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349
13350static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013351unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013352/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013354 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013355 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013356 int kind;
13357 void *data;
13358 Py_UCS4 chr;
13359
Benjamin Petersonbac79492012-01-14 13:34:47 -050013360 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362
Victor Stinnerc4b49542011-12-11 22:44:26 +010013363 if (PyUnicode_GET_LENGTH(self) >= width)
13364 return unicode_result_unchanged(self);
13365
13366 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367
13368 u = pad(self, fill, 0, '0');
13369
Walter Dörwald068325e2002-04-15 13:36:47 +000013370 if (u == NULL)
13371 return NULL;
13372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013373 kind = PyUnicode_KIND(u);
13374 data = PyUnicode_DATA(u);
13375 chr = PyUnicode_READ(kind, data, fill);
13376
13377 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013379 PyUnicode_WRITE(kind, data, 0, chr);
13380 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381 }
13382
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013383 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013384 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013385}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013386
#if 0
/* Compiled out by the surrounding #if 0.  When enabled, this simply
   returns the result of PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013395PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013398Return True if S starts with the specified prefix, False otherwise.\n\
13399With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013400With optional end, stop comparing S at that position.\n\
13401prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013402
13403static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013404unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013407 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013408 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013409 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013410 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013411 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013412
Jesus Ceaac451502011-04-20 17:09:23 +020013413 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013415 if (PyTuple_Check(subobj)) {
13416 Py_ssize_t i;
13417 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013418 substring = PyTuple_GET_ITEM(subobj, i);
13419 if (!PyUnicode_Check(substring)) {
13420 PyErr_Format(PyExc_TypeError,
13421 "tuple for startswith must only contain str, "
13422 "not %.100s",
13423 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013424 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013425 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013426 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013427 if (result == -1)
13428 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013429 if (result) {
13430 Py_RETURN_TRUE;
13431 }
13432 }
13433 /* nothing matched */
13434 Py_RETURN_FALSE;
13435 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013436 if (!PyUnicode_Check(subobj)) {
13437 PyErr_Format(PyExc_TypeError,
13438 "startswith first arg must be str or "
13439 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013441 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013442 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013443 if (result == -1)
13444 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013445 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446}
13447
13448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013449PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013451\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013452Return True if S ends with the specified suffix, False otherwise.\n\
13453With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013454With optional end, stop comparing S at that position.\n\
13455suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013456
13457static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013458unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013459 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013460{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013461 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013462 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013463 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013464 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013465 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466
Jesus Ceaac451502011-04-20 17:09:23 +020013467 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013469 if (PyTuple_Check(subobj)) {
13470 Py_ssize_t i;
13471 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013472 substring = PyTuple_GET_ITEM(subobj, i);
13473 if (!PyUnicode_Check(substring)) {
13474 PyErr_Format(PyExc_TypeError,
13475 "tuple for endswith must only contain str, "
13476 "not %.100s",
13477 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013479 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013480 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013481 if (result == -1)
13482 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013483 if (result) {
13484 Py_RETURN_TRUE;
13485 }
13486 }
13487 Py_RETURN_FALSE;
13488 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013489 if (!PyUnicode_Check(subobj)) {
13490 PyErr_Format(PyExc_TypeError,
13491 "endswith first arg must be str or "
13492 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013493 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013494 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013495 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013496 if (result == -1)
13497 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013498 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013499}
13500
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013501static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013502_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013503{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013504 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13505 writer->data = PyUnicode_DATA(writer->buffer);
13506
13507 if (!writer->readonly) {
13508 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013509 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013510 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013511 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013512 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13513 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13514 writer->kind = PyUnicode_WCHAR_KIND;
13515 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13516
Victor Stinner8f674cc2013-04-17 23:02:17 +020013517 /* Copy-on-write mode: set buffer size to 0 so
13518 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13519 * next write. */
13520 writer->size = 0;
13521 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013522}
13523
Victor Stinnerd3f08822012-05-29 12:57:52 +020013524void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013525_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013526{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013527 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013528
13529 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013530 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013531
13532 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13533 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13534 writer->kind = PyUnicode_WCHAR_KIND;
13535 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013536}
13537
Victor Stinnerd3f08822012-05-29 12:57:52 +020013538int
13539_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13540 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013541{
13542 Py_ssize_t newlen;
13543 PyObject *newbuffer;
13544
Victor Stinner2740e462016-09-06 16:58:36 -070013545 assert(maxchar <= MAX_UNICODE);
13546
Victor Stinnerca9381e2015-09-22 00:58:32 +020013547 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013548 assert((maxchar > writer->maxchar && length >= 0)
13549 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013550
Victor Stinner202fdca2012-05-07 12:47:02 +020013551 if (length > PY_SSIZE_T_MAX - writer->pos) {
13552 PyErr_NoMemory();
13553 return -1;
13554 }
13555 newlen = writer->pos + length;
13556
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013557 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013558
Victor Stinnerd3f08822012-05-29 12:57:52 +020013559 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013560 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013561 if (writer->overallocate
13562 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13563 /* overallocate to limit the number of realloc() */
13564 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013565 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013566 if (newlen < writer->min_length)
13567 newlen = writer->min_length;
13568
Victor Stinnerd3f08822012-05-29 12:57:52 +020013569 writer->buffer = PyUnicode_New(newlen, maxchar);
13570 if (writer->buffer == NULL)
13571 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013572 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013573 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013574 if (writer->overallocate
13575 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13576 /* overallocate to limit the number of realloc() */
13577 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013578 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013579 if (newlen < writer->min_length)
13580 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013581
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013582 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013583 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013584 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013585 newbuffer = PyUnicode_New(newlen, maxchar);
13586 if (newbuffer == NULL)
13587 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013588 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13589 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013590 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013591 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013592 }
13593 else {
13594 newbuffer = resize_compact(writer->buffer, newlen);
13595 if (newbuffer == NULL)
13596 return -1;
13597 }
13598 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013599 }
13600 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013601 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013602 newbuffer = PyUnicode_New(writer->size, maxchar);
13603 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013604 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013605 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13606 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013607 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013608 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013609 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013610 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013611
13612#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013613}
13614
Victor Stinnerca9381e2015-09-22 00:58:32 +020013615int
13616_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13617 enum PyUnicode_Kind kind)
13618{
13619 Py_UCS4 maxchar;
13620
13621 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13622 assert(writer->kind < kind);
13623
13624 switch (kind)
13625 {
13626 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13627 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13628 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13629 default:
13630 assert(0 && "invalid kind");
13631 return -1;
13632 }
13633
13634 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13635}
13636
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013637static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013638_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013639{
Victor Stinner2740e462016-09-06 16:58:36 -070013640 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013641 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13642 return -1;
13643 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13644 writer->pos++;
13645 return 0;
13646}
13647
13648int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013649_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13650{
13651 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13652}
13653
13654int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013655_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13656{
13657 Py_UCS4 maxchar;
13658 Py_ssize_t len;
13659
13660 if (PyUnicode_READY(str) == -1)
13661 return -1;
13662 len = PyUnicode_GET_LENGTH(str);
13663 if (len == 0)
13664 return 0;
13665 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13666 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013667 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013668 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013669 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013670 Py_INCREF(str);
13671 writer->buffer = str;
13672 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013673 writer->pos += len;
13674 return 0;
13675 }
13676 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13677 return -1;
13678 }
13679 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13680 str, 0, len);
13681 writer->pos += len;
13682 return 0;
13683}
13684
Victor Stinnere215d962012-10-06 23:03:36 +020013685int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013686_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13687 Py_ssize_t start, Py_ssize_t end)
13688{
13689 Py_UCS4 maxchar;
13690 Py_ssize_t len;
13691
13692 if (PyUnicode_READY(str) == -1)
13693 return -1;
13694
13695 assert(0 <= start);
13696 assert(end <= PyUnicode_GET_LENGTH(str));
13697 assert(start <= end);
13698
13699 if (end == 0)
13700 return 0;
13701
13702 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13703 return _PyUnicodeWriter_WriteStr(writer, str);
13704
13705 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13706 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13707 else
13708 maxchar = writer->maxchar;
13709 len = end - start;
13710
13711 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13712 return -1;
13713
13714 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13715 str, start, len);
13716 writer->pos += len;
13717 return 0;
13718}
13719
13720int
Victor Stinner4a587072013-11-19 12:54:53 +010013721_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13722 const char *ascii, Py_ssize_t len)
13723{
13724 if (len == -1)
13725 len = strlen(ascii);
13726
13727 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13728
13729 if (writer->buffer == NULL && !writer->overallocate) {
13730 PyObject *str;
13731
13732 str = _PyUnicode_FromASCII(ascii, len);
13733 if (str == NULL)
13734 return -1;
13735
13736 writer->readonly = 1;
13737 writer->buffer = str;
13738 _PyUnicodeWriter_Update(writer);
13739 writer->pos += len;
13740 return 0;
13741 }
13742
13743 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13744 return -1;
13745
13746 switch (writer->kind)
13747 {
13748 case PyUnicode_1BYTE_KIND:
13749 {
13750 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13751 Py_UCS1 *data = writer->data;
13752
Christian Heimesf051e432016-09-13 20:22:02 +020013753 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013754 break;
13755 }
13756 case PyUnicode_2BYTE_KIND:
13757 {
13758 _PyUnicode_CONVERT_BYTES(
13759 Py_UCS1, Py_UCS2,
13760 ascii, ascii + len,
13761 (Py_UCS2 *)writer->data + writer->pos);
13762 break;
13763 }
13764 case PyUnicode_4BYTE_KIND:
13765 {
13766 _PyUnicode_CONVERT_BYTES(
13767 Py_UCS1, Py_UCS4,
13768 ascii, ascii + len,
13769 (Py_UCS4 *)writer->data + writer->pos);
13770 break;
13771 }
13772 default:
13773 assert(0);
13774 }
13775
13776 writer->pos += len;
13777 return 0;
13778}
13779
13780int
13781_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13782 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013783{
13784 Py_UCS4 maxchar;
13785
13786 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13787 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13788 return -1;
13789 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13790 writer->pos += len;
13791 return 0;
13792}
13793
Victor Stinnerd3f08822012-05-29 12:57:52 +020013794PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013795_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013796{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013797 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013798
Victor Stinnerd3f08822012-05-29 12:57:52 +020013799 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013800 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013801 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013802 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013803
13804 str = writer->buffer;
13805 writer->buffer = NULL;
13806
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013807 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013808 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13809 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013810 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013811
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013812 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13813 PyObject *str2;
13814 str2 = resize_compact(str, writer->pos);
13815 if (str2 == NULL) {
13816 Py_DECREF(str);
13817 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013818 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013819 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013820 }
13821
Victor Stinner15a0bd32013-07-08 22:29:55 +020013822 assert(_PyUnicode_CheckConsistency(str, 1));
13823 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013824}
13825
/* Release the writer's buffer, if any, and reset the pointer to NULL.
   Used on error paths instead of _PyUnicodeWriter_Finish(). */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
13831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013832#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013833
13834PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013835 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013836\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013837Return a formatted version of S, using substitutions from args and kwargs.\n\
13838The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013839
Eric Smith27bbca62010-11-04 17:06:58 +000013840PyDoc_STRVAR(format_map__doc__,
13841 "S.format_map(mapping) -> str\n\
13842\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013843Return a formatted version of S, using substitutions from mapping.\n\
13844The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013845
INADA Naoki3ae20562017-01-16 20:41:20 +090013846/*[clinic input]
13847str.__format__ as unicode___format__
13848
13849 format_spec: unicode
13850 /
13851
13852Return a formatted version of the string as described by format_spec.
13853[clinic start generated code]*/
13854
Eric Smith4a7d76d2008-05-30 18:10:19 +000013855static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013856unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013857/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013858{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013859 _PyUnicodeWriter writer;
13860 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013861
Victor Stinnerd3f08822012-05-29 12:57:52 +020013862 if (PyUnicode_READY(self) == -1)
13863 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013864 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013865 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13866 self, format_spec, 0,
13867 PyUnicode_GET_LENGTH(format_spec));
13868 if (ret == -1) {
13869 _PyUnicodeWriter_Dealloc(&writer);
13870 return NULL;
13871 }
13872 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013873}
13874
INADA Naoki3ae20562017-01-16 20:41:20 +090013875/*[clinic input]
13876str.__sizeof__ as unicode_sizeof
13877
13878Return the size of the string in memory, in bytes.
13879[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013880
13881static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013882unicode_sizeof_impl(PyObject *self)
13883/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013884{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013885 Py_ssize_t size;
13886
13887 /* If it's a compact object, account for base structure +
13888 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013889 if (PyUnicode_IS_COMPACT_ASCII(self))
13890 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13891 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013893 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013894 else {
13895 /* If it is a two-block object, account for base object, and
13896 for character block if present. */
13897 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013898 if (_PyUnicode_DATA_ANY(self))
13899 size += (PyUnicode_GET_LENGTH(self) + 1) *
13900 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013901 }
13902 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013903 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013904 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13905 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13906 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13907 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013908
13909 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013910}
13911
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013912static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013913unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013914{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013915 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013916 if (!copy)
13917 return NULL;
13918 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013919}
13920
/* Method table for the str type.

   Entries written as UNICODE_*_METHODDEF are macros defined elsewhere
   (presumably emitted by Argument Clinic, as for the str.__format__ and
   str.__sizeof__ clinic blocks in this file -- confirm against the
   generated header).  The remaining entries are spelled out by hand. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() accepts both positional and keyword arguments. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}        /* sentinel: end of table */
};
13976
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013977static PyObject *
13978unicode_mod(PyObject *v, PyObject *w)
13979{
Brian Curtindfc80e32011-08-10 20:28:54 -050013980 if (!PyUnicode_Check(v))
13981 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013982 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013983}
13984
13985static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013986 0, /*nb_add*/
13987 0, /*nb_subtract*/
13988 0, /*nb_multiply*/
13989 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013990};
13991
Guido van Rossumd57fd912000-03-10 22:53:23 +000013992static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013993 (lenfunc) unicode_length, /* sq_length */
13994 PyUnicode_Concat, /* sq_concat */
13995 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13996 (ssizeargfunc) unicode_getitem, /* sq_item */
13997 0, /* sq_slice */
13998 0, /* sq_ass_item */
13999 0, /* sq_ass_slice */
14000 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001};
14002
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014003static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014004unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014006 if (PyUnicode_READY(self) == -1)
14007 return NULL;
14008
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014009 if (PyIndex_Check(item)) {
14010 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014011 if (i == -1 && PyErr_Occurred())
14012 return NULL;
14013 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014014 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014015 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014016 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000014017 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014018 PyObject *result;
14019 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014020 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014021 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014022
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014023 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014024 return NULL;
14025 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014026 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14027 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014028
14029 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014030 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014031 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014032 slicelength == PyUnicode_GET_LENGTH(self)) {
14033 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014034 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014035 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014036 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014037 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014038 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014039 src_kind = PyUnicode_KIND(self);
14040 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014041 if (!PyUnicode_IS_ASCII(self)) {
14042 kind_limit = kind_maxchar_limit(src_kind);
14043 max_char = 0;
14044 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14045 ch = PyUnicode_READ(src_kind, src_data, cur);
14046 if (ch > max_char) {
14047 max_char = ch;
14048 if (max_char >= kind_limit)
14049 break;
14050 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014051 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014052 }
Victor Stinner55c99112011-10-13 01:17:06 +020014053 else
14054 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014055 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014056 if (result == NULL)
14057 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014058 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014059 dest_data = PyUnicode_DATA(result);
14060
14061 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014062 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14063 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014064 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014065 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014066 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014067 } else {
14068 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14069 return NULL;
14070 }
14071}
14072
14073static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014074 (lenfunc)unicode_length, /* mp_length */
14075 (binaryfunc)unicode_subscript, /* mp_subscript */
14076 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014077};
14078
Guido van Rossumd57fd912000-03-10 22:53:23 +000014079
Guido van Rossumd57fd912000-03-10 22:53:23 +000014080/* Helpers for PyUnicode_Format() */
14081
Victor Stinnera47082312012-10-04 02:19:54 +020014082struct unicode_formatter_t {
14083 PyObject *args;
14084 int args_owned;
14085 Py_ssize_t arglen, argidx;
14086 PyObject *dict;
14087
14088 enum PyUnicode_Kind fmtkind;
14089 Py_ssize_t fmtcnt, fmtpos;
14090 void *fmtdata;
14091 PyObject *fmtstr;
14092
14093 _PyUnicodeWriter writer;
14094};
14095
14096struct unicode_format_arg_t {
14097 Py_UCS4 ch;
14098 int flags;
14099 Py_ssize_t width;
14100 int prec;
14101 int sign;
14102};
14103
Guido van Rossumd57fd912000-03-10 22:53:23 +000014104static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014105unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014106{
Victor Stinnera47082312012-10-04 02:19:54 +020014107 Py_ssize_t argidx = ctx->argidx;
14108
14109 if (argidx < ctx->arglen) {
14110 ctx->argidx++;
14111 if (ctx->arglen < 0)
14112 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014113 else
Victor Stinnera47082312012-10-04 02:19:54 +020014114 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014115 }
14116 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014117 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014118 return NULL;
14119}
14120
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014121/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014122
Victor Stinnera47082312012-10-04 02:19:54 +020014123/* Format a float into the writer if the writer is not NULL, or into *p_output
14124 otherwise.
14125
14126 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014127static int
Victor Stinnera47082312012-10-04 02:19:54 +020014128formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14129 PyObject **p_output,
14130 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014131{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014132 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014133 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014134 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014135 int prec;
14136 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014137
Guido van Rossumd57fd912000-03-10 22:53:23 +000014138 x = PyFloat_AsDouble(v);
14139 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014140 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014141
Victor Stinnera47082312012-10-04 02:19:54 +020014142 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014143 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014144 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014145
Victor Stinnera47082312012-10-04 02:19:54 +020014146 if (arg->flags & F_ALT)
14147 dtoa_flags = Py_DTSF_ALT;
14148 else
14149 dtoa_flags = 0;
14150 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014151 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014152 return -1;
14153 len = strlen(p);
14154 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014155 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014156 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014157 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014158 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014159 }
14160 else
14161 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014162 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014163 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014164}
14165
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the "alternate form" flag, for Python ints.
 * Return value:  a new str object, or NULL if error.
 * The output string is of the form
 *     "-"? ("0o" | "0x" | "0X")? digit+
 * The base marker is present only for o, x and X conversions with alt
 * nonzero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *     val          object to be converted; must be an int (asserted)
 *     alt          nonzero to keep the base marker
 *     prec         minimum number of digits; 0-fill on left if needed
 *     type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a leading '-' sign.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits + prec is computed as an int below and numnondigits is
       at most 3 (sign plus a two-character base marker), so bound prec to
       keep that sum within INT_MAX. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        /* Debug builds stop here; otherwise an unknown type falls through
           and is formatted as decimal. */
        assert(!"'type' not in [diuoxX]");
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* The converted text starts with a two-character base marker. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters.  For a negative number ("-0x1f") buf now
           starts at the marker's second character, which is overwritten
           with the sign so the text reads "-1f".  Since buf no longer
           equals the start of result's data, the final step below builds
           a new string from buf. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* A bytes object serves as scratch space for the padded text:
           non-digit prefix, then the zero padding, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        /* From here on result is the bytes scratch object, not a str. */
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text lives in the bytes scratch object, or buf was advanced
       past the start of result's data, build the final str from buf. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        /* May be NULL; the NULL is then returned to the caller. */
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14311
Ethan Furmandf3ed242014-01-05 06:50:30 -080014312/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014313 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014314 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014315 * -1 and raise an exception on error */
14316static int
Victor Stinnera47082312012-10-04 02:19:54 +020014317mainformatlong(PyObject *v,
14318 struct unicode_format_arg_t *arg,
14319 PyObject **p_output,
14320 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014321{
14322 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014323 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014324
14325 if (!PyNumber_Check(v))
14326 goto wrongtype;
14327
Ethan Furman9ab74802014-03-21 06:38:46 -070014328 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014329 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014330 if (type == 'o' || type == 'x' || type == 'X') {
14331 iobj = PyNumber_Index(v);
14332 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014333 if (PyErr_ExceptionMatches(PyExc_TypeError))
14334 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014335 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014336 }
14337 }
14338 else {
14339 iobj = PyNumber_Long(v);
14340 if (iobj == NULL ) {
14341 if (PyErr_ExceptionMatches(PyExc_TypeError))
14342 goto wrongtype;
14343 return -1;
14344 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014345 }
14346 assert(PyLong_Check(iobj));
14347 }
14348 else {
14349 iobj = v;
14350 Py_INCREF(iobj);
14351 }
14352
14353 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014354 && arg->width == -1 && arg->prec == -1
14355 && !(arg->flags & (F_SIGN | F_BLANK))
14356 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014357 {
14358 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014359 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014360 int base;
14361
Victor Stinnera47082312012-10-04 02:19:54 +020014362 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014363 {
14364 default:
14365 assert(0 && "'type' not in [diuoxX]");
14366 case 'd':
14367 case 'i':
14368 case 'u':
14369 base = 10;
14370 break;
14371 case 'o':
14372 base = 8;
14373 break;
14374 case 'x':
14375 case 'X':
14376 base = 16;
14377 break;
14378 }
14379
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014380 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14381 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014382 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014383 }
14384 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014385 return 1;
14386 }
14387
Ethan Furmanb95b5612015-01-23 20:05:18 -080014388 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014389 Py_DECREF(iobj);
14390 if (res == NULL)
14391 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014392 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014393 return 0;
14394
14395wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014396 switch(type)
14397 {
14398 case 'o':
14399 case 'x':
14400 case 'X':
14401 PyErr_Format(PyExc_TypeError,
14402 "%%%c format: an integer is required, "
14403 "not %.200s",
14404 type, Py_TYPE(v)->tp_name);
14405 break;
14406 default:
14407 PyErr_Format(PyExc_TypeError,
14408 "%%%c format: a number is required, "
14409 "not %.200s",
14410 type, Py_TYPE(v)->tp_name);
14411 break;
14412 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014413 return -1;
14414}
14415
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014416static Py_UCS4
14417formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014418{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014419 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014420 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014421 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014422 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014423 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014424 goto onError;
14425 }
14426 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014427 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014428 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014429 /* make sure number is a type of integer */
14430 if (!PyLong_Check(v)) {
14431 iobj = PyNumber_Index(v);
14432 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014433 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014434 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014435 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014436 Py_DECREF(iobj);
14437 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014438 else {
14439 x = PyLong_AsLong(v);
14440 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014441 if (x == -1 && PyErr_Occurred())
14442 goto onError;
14443
Victor Stinner8faf8212011-12-08 22:14:11 +010014444 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014445 PyErr_SetString(PyExc_OverflowError,
14446 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014447 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014448 }
14449
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014450 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014451 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014452
Benjamin Peterson29060642009-01-31 22:14:21 +000014453 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014454 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014455 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014456 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014457}
14458
Victor Stinnera47082312012-10-04 02:19:54 +020014459/* Parse options of an argument: flags, width, precision.
14460 Handle also "%(name)" syntax.
14461
14462 Return 0 if the argument has been formatted into arg->str.
14463 Return 1 if the argument has been written into ctx->writer,
14464 Raise an exception and return -1 on error. */
14465static int
14466unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14467 struct unicode_format_arg_t *arg)
14468{
14469#define FORMAT_READ(ctx) \
14470 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14471
14472 PyObject *v;
14473
Victor Stinnera47082312012-10-04 02:19:54 +020014474 if (arg->ch == '(') {
14475 /* Get argument value from a dictionary. Example: "%(name)s". */
14476 Py_ssize_t keystart;
14477 Py_ssize_t keylen;
14478 PyObject *key;
14479 int pcount = 1;
14480
14481 if (ctx->dict == NULL) {
14482 PyErr_SetString(PyExc_TypeError,
14483 "format requires a mapping");
14484 return -1;
14485 }
14486 ++ctx->fmtpos;
14487 --ctx->fmtcnt;
14488 keystart = ctx->fmtpos;
14489 /* Skip over balanced parentheses */
14490 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14491 arg->ch = FORMAT_READ(ctx);
14492 if (arg->ch == ')')
14493 --pcount;
14494 else if (arg->ch == '(')
14495 ++pcount;
14496 ctx->fmtpos++;
14497 }
14498 keylen = ctx->fmtpos - keystart - 1;
14499 if (ctx->fmtcnt < 0 || pcount > 0) {
14500 PyErr_SetString(PyExc_ValueError,
14501 "incomplete format key");
14502 return -1;
14503 }
14504 key = PyUnicode_Substring(ctx->fmtstr,
14505 keystart, keystart + keylen);
14506 if (key == NULL)
14507 return -1;
14508 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014509 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014510 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014511 }
14512 ctx->args = PyObject_GetItem(ctx->dict, key);
14513 Py_DECREF(key);
14514 if (ctx->args == NULL)
14515 return -1;
14516 ctx->args_owned = 1;
14517 ctx->arglen = -1;
14518 ctx->argidx = -2;
14519 }
14520
14521 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014522 while (--ctx->fmtcnt >= 0) {
14523 arg->ch = FORMAT_READ(ctx);
14524 ctx->fmtpos++;
14525 switch (arg->ch) {
14526 case '-': arg->flags |= F_LJUST; continue;
14527 case '+': arg->flags |= F_SIGN; continue;
14528 case ' ': arg->flags |= F_BLANK; continue;
14529 case '#': arg->flags |= F_ALT; continue;
14530 case '0': arg->flags |= F_ZERO; continue;
14531 }
14532 break;
14533 }
14534
14535 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014536 if (arg->ch == '*') {
14537 v = unicode_format_getnextarg(ctx);
14538 if (v == NULL)
14539 return -1;
14540 if (!PyLong_Check(v)) {
14541 PyErr_SetString(PyExc_TypeError,
14542 "* wants int");
14543 return -1;
14544 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014545 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014546 if (arg->width == -1 && PyErr_Occurred())
14547 return -1;
14548 if (arg->width < 0) {
14549 arg->flags |= F_LJUST;
14550 arg->width = -arg->width;
14551 }
14552 if (--ctx->fmtcnt >= 0) {
14553 arg->ch = FORMAT_READ(ctx);
14554 ctx->fmtpos++;
14555 }
14556 }
14557 else if (arg->ch >= '0' && arg->ch <= '9') {
14558 arg->width = arg->ch - '0';
14559 while (--ctx->fmtcnt >= 0) {
14560 arg->ch = FORMAT_READ(ctx);
14561 ctx->fmtpos++;
14562 if (arg->ch < '0' || arg->ch > '9')
14563 break;
14564 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14565 mixing signed and unsigned comparison. Since arg->ch is between
14566 '0' and '9', casting to int is safe. */
14567 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14568 PyErr_SetString(PyExc_ValueError,
14569 "width too big");
14570 return -1;
14571 }
14572 arg->width = arg->width*10 + (arg->ch - '0');
14573 }
14574 }
14575
14576 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014577 if (arg->ch == '.') {
14578 arg->prec = 0;
14579 if (--ctx->fmtcnt >= 0) {
14580 arg->ch = FORMAT_READ(ctx);
14581 ctx->fmtpos++;
14582 }
14583 if (arg->ch == '*') {
14584 v = unicode_format_getnextarg(ctx);
14585 if (v == NULL)
14586 return -1;
14587 if (!PyLong_Check(v)) {
14588 PyErr_SetString(PyExc_TypeError,
14589 "* wants int");
14590 return -1;
14591 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014592 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014593 if (arg->prec == -1 && PyErr_Occurred())
14594 return -1;
14595 if (arg->prec < 0)
14596 arg->prec = 0;
14597 if (--ctx->fmtcnt >= 0) {
14598 arg->ch = FORMAT_READ(ctx);
14599 ctx->fmtpos++;
14600 }
14601 }
14602 else if (arg->ch >= '0' && arg->ch <= '9') {
14603 arg->prec = arg->ch - '0';
14604 while (--ctx->fmtcnt >= 0) {
14605 arg->ch = FORMAT_READ(ctx);
14606 ctx->fmtpos++;
14607 if (arg->ch < '0' || arg->ch > '9')
14608 break;
14609 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14610 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014611 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014612 return -1;
14613 }
14614 arg->prec = arg->prec*10 + (arg->ch - '0');
14615 }
14616 }
14617 }
14618
14619 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14620 if (ctx->fmtcnt >= 0) {
14621 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14622 if (--ctx->fmtcnt >= 0) {
14623 arg->ch = FORMAT_READ(ctx);
14624 ctx->fmtpos++;
14625 }
14626 }
14627 }
14628 if (ctx->fmtcnt < 0) {
14629 PyErr_SetString(PyExc_ValueError,
14630 "incomplete format");
14631 return -1;
14632 }
14633 return 0;
14634
14635#undef FORMAT_READ
14636}
14637
14638/* Format one argument. Supported conversion specifiers:
14639
14640 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014641 - "i", "d", "u": int or float
14642 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014643 - "e", "E", "f", "F", "g", "G": float
14644 - "c": int or str (1 character)
14645
Victor Stinner8dbd4212012-12-04 09:30:24 +010014646 When possible, the output is written directly into the Unicode writer
14647 (ctx->writer). A string is created when padding is required.
14648
Victor Stinnera47082312012-10-04 02:19:54 +020014649 Return 0 if the argument has been formatted into *p_str,
14650 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014651 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014652static int
14653unicode_format_arg_format(struct unicode_formatter_t *ctx,
14654 struct unicode_format_arg_t *arg,
14655 PyObject **p_str)
14656{
14657 PyObject *v;
14658 _PyUnicodeWriter *writer = &ctx->writer;
14659
14660 if (ctx->fmtcnt == 0)
14661 ctx->writer.overallocate = 0;
14662
Victor Stinnera47082312012-10-04 02:19:54 +020014663 v = unicode_format_getnextarg(ctx);
14664 if (v == NULL)
14665 return -1;
14666
Victor Stinnera47082312012-10-04 02:19:54 +020014667
14668 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014669 case 's':
14670 case 'r':
14671 case 'a':
14672 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14673 /* Fast path */
14674 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14675 return -1;
14676 return 1;
14677 }
14678
14679 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14680 *p_str = v;
14681 Py_INCREF(*p_str);
14682 }
14683 else {
14684 if (arg->ch == 's')
14685 *p_str = PyObject_Str(v);
14686 else if (arg->ch == 'r')
14687 *p_str = PyObject_Repr(v);
14688 else
14689 *p_str = PyObject_ASCII(v);
14690 }
14691 break;
14692
14693 case 'i':
14694 case 'd':
14695 case 'u':
14696 case 'o':
14697 case 'x':
14698 case 'X':
14699 {
14700 int ret = mainformatlong(v, arg, p_str, writer);
14701 if (ret != 0)
14702 return ret;
14703 arg->sign = 1;
14704 break;
14705 }
14706
14707 case 'e':
14708 case 'E':
14709 case 'f':
14710 case 'F':
14711 case 'g':
14712 case 'G':
14713 if (arg->width == -1 && arg->prec == -1
14714 && !(arg->flags & (F_SIGN | F_BLANK)))
14715 {
14716 /* Fast path */
14717 if (formatfloat(v, arg, NULL, writer) == -1)
14718 return -1;
14719 return 1;
14720 }
14721
14722 arg->sign = 1;
14723 if (formatfloat(v, arg, p_str, NULL) == -1)
14724 return -1;
14725 break;
14726
14727 case 'c':
14728 {
14729 Py_UCS4 ch = formatchar(v);
14730 if (ch == (Py_UCS4) -1)
14731 return -1;
14732 if (arg->width == -1 && arg->prec == -1) {
14733 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014734 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014735 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014736 return 1;
14737 }
14738 *p_str = PyUnicode_FromOrdinal(ch);
14739 break;
14740 }
14741
14742 default:
14743 PyErr_Format(PyExc_ValueError,
14744 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014745 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014746 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14747 (int)arg->ch,
14748 ctx->fmtpos - 1);
14749 return -1;
14750 }
14751 if (*p_str == NULL)
14752 return -1;
14753 assert (PyUnicode_Check(*p_str));
14754 return 0;
14755}
14756
14757static int
14758unicode_format_arg_output(struct unicode_formatter_t *ctx,
14759 struct unicode_format_arg_t *arg,
14760 PyObject *str)
14761{
14762 Py_ssize_t len;
14763 enum PyUnicode_Kind kind;
14764 void *pbuf;
14765 Py_ssize_t pindex;
14766 Py_UCS4 signchar;
14767 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014768 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014769 Py_ssize_t sublen;
14770 _PyUnicodeWriter *writer = &ctx->writer;
14771 Py_UCS4 fill;
14772
14773 fill = ' ';
14774 if (arg->sign && arg->flags & F_ZERO)
14775 fill = '0';
14776
14777 if (PyUnicode_READY(str) == -1)
14778 return -1;
14779
14780 len = PyUnicode_GET_LENGTH(str);
14781 if ((arg->width == -1 || arg->width <= len)
14782 && (arg->prec == -1 || arg->prec >= len)
14783 && !(arg->flags & (F_SIGN | F_BLANK)))
14784 {
14785 /* Fast path */
14786 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14787 return -1;
14788 return 0;
14789 }
14790
14791 /* Truncate the string for "s", "r" and "a" formats
14792 if the precision is set */
14793 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14794 if (arg->prec >= 0 && len > arg->prec)
14795 len = arg->prec;
14796 }
14797
14798 /* Adjust sign and width */
14799 kind = PyUnicode_KIND(str);
14800 pbuf = PyUnicode_DATA(str);
14801 pindex = 0;
14802 signchar = '\0';
14803 if (arg->sign) {
14804 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14805 if (ch == '-' || ch == '+') {
14806 signchar = ch;
14807 len--;
14808 pindex++;
14809 }
14810 else if (arg->flags & F_SIGN)
14811 signchar = '+';
14812 else if (arg->flags & F_BLANK)
14813 signchar = ' ';
14814 else
14815 arg->sign = 0;
14816 }
14817 if (arg->width < len)
14818 arg->width = len;
14819
14820 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014821 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014822 if (!(arg->flags & F_LJUST)) {
14823 if (arg->sign) {
14824 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014825 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014826 }
14827 else {
14828 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014829 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014830 }
14831 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014832 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14833 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014834 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014835 }
14836
Victor Stinnera47082312012-10-04 02:19:54 +020014837 buflen = arg->width;
14838 if (arg->sign && len == arg->width)
14839 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014840 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014841 return -1;
14842
14843 /* Write the sign if needed */
14844 if (arg->sign) {
14845 if (fill != ' ') {
14846 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14847 writer->pos += 1;
14848 }
14849 if (arg->width > len)
14850 arg->width--;
14851 }
14852
14853 /* Write the numeric prefix for "x", "X" and "o" formats
14854 if the alternate form is used.
14855 For example, write "0x" for the "%#x" format. */
14856 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14857 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14858 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14859 if (fill != ' ') {
14860 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14861 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14862 writer->pos += 2;
14863 pindex += 2;
14864 }
14865 arg->width -= 2;
14866 if (arg->width < 0)
14867 arg->width = 0;
14868 len -= 2;
14869 }
14870
14871 /* Pad left with the fill character if needed */
14872 if (arg->width > len && !(arg->flags & F_LJUST)) {
14873 sublen = arg->width - len;
14874 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14875 writer->pos += sublen;
14876 arg->width = len;
14877 }
14878
14879 /* If padding with spaces: write sign if needed and/or numeric prefix if
14880 the alternate form is used */
14881 if (fill == ' ') {
14882 if (arg->sign) {
14883 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14884 writer->pos += 1;
14885 }
14886 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14887 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14888 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14889 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14890 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14891 writer->pos += 2;
14892 pindex += 2;
14893 }
14894 }
14895
14896 /* Write characters */
14897 if (len) {
14898 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14899 str, pindex, len);
14900 writer->pos += len;
14901 }
14902
14903 /* Pad right with the fill character if needed */
14904 if (arg->width > len) {
14905 sublen = arg->width - len;
14906 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14907 writer->pos += sublen;
14908 }
14909 return 0;
14910}
14911
14912/* Helper of PyUnicode_Format(): format one arg.
14913 Return 0 on success, raise an exception and return -1 on error. */
14914static int
14915unicode_format_arg(struct unicode_formatter_t *ctx)
14916{
14917 struct unicode_format_arg_t arg;
14918 PyObject *str;
14919 int ret;
14920
Victor Stinner8dbd4212012-12-04 09:30:24 +010014921 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014922 if (arg.ch == '%') {
14923 ctx->fmtpos++;
14924 ctx->fmtcnt--;
14925 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14926 return -1;
14927 return 0;
14928 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014929 arg.flags = 0;
14930 arg.width = -1;
14931 arg.prec = -1;
14932 arg.sign = 0;
14933 str = NULL;
14934
Victor Stinnera47082312012-10-04 02:19:54 +020014935 ret = unicode_format_arg_parse(ctx, &arg);
14936 if (ret == -1)
14937 return -1;
14938
14939 ret = unicode_format_arg_format(ctx, &arg, &str);
14940 if (ret == -1)
14941 return -1;
14942
14943 if (ret != 1) {
14944 ret = unicode_format_arg_output(ctx, &arg, str);
14945 Py_DECREF(str);
14946 if (ret == -1)
14947 return -1;
14948 }
14949
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014950 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014951 PyErr_SetString(PyExc_TypeError,
14952 "not all arguments converted during string formatting");
14953 return -1;
14954 }
14955 return 0;
14956}
14957
Alexander Belopolsky40018472011-02-26 01:02:56 +000014958PyObject *
14959PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014960{
Victor Stinnera47082312012-10-04 02:19:54 +020014961 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014962
Guido van Rossumd57fd912000-03-10 22:53:23 +000014963 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014964 PyErr_BadInternalCall();
14965 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014966 }
Victor Stinnera47082312012-10-04 02:19:54 +020014967
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014968 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014969 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014970
14971 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014972 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14973 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14974 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14975 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014976
Victor Stinner8f674cc2013-04-17 23:02:17 +020014977 _PyUnicodeWriter_Init(&ctx.writer);
14978 ctx.writer.min_length = ctx.fmtcnt + 100;
14979 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014980
Guido van Rossumd57fd912000-03-10 22:53:23 +000014981 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014982 ctx.arglen = PyTuple_Size(args);
14983 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014984 }
14985 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014986 ctx.arglen = -1;
14987 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014988 }
Victor Stinnera47082312012-10-04 02:19:54 +020014989 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014990 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014991 ctx.dict = args;
14992 else
14993 ctx.dict = NULL;
14994 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014995
Victor Stinnera47082312012-10-04 02:19:54 +020014996 while (--ctx.fmtcnt >= 0) {
14997 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014998 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014999
15000 nonfmtpos = ctx.fmtpos++;
15001 while (ctx.fmtcnt >= 0 &&
15002 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15003 ctx.fmtpos++;
15004 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015005 }
Victor Stinnera47082312012-10-04 02:19:54 +020015006 if (ctx.fmtcnt < 0) {
15007 ctx.fmtpos--;
15008 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015009 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015010
Victor Stinnercfc4c132013-04-03 01:48:39 +020015011 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15012 nonfmtpos, ctx.fmtpos) < 0)
15013 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015014 }
15015 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015016 ctx.fmtpos++;
15017 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015018 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015019 }
15020 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015021
Victor Stinnera47082312012-10-04 02:19:54 +020015022 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015023 PyErr_SetString(PyExc_TypeError,
15024 "not all arguments converted during string formatting");
15025 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015026 }
15027
Victor Stinnera47082312012-10-04 02:19:54 +020015028 if (ctx.args_owned) {
15029 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015030 }
Victor Stinnera47082312012-10-04 02:19:54 +020015031 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015032
Benjamin Peterson29060642009-01-31 22:14:21 +000015033 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015034 _PyUnicodeWriter_Dealloc(&ctx.writer);
15035 if (ctx.args_owned) {
15036 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015037 }
15038 return NULL;
15039}
15040
Jeremy Hylton938ace62002-07-17 16:30:39 +000015041static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015042unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15043
Tim Peters6d6c1a32001-08-02 04:15:00 +000015044static PyObject *
15045unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15046{
Benjamin Peterson29060642009-01-31 22:14:21 +000015047 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015048 static char *kwlist[] = {"object", "encoding", "errors", 0};
15049 char *encoding = NULL;
15050 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015051
Benjamin Peterson14339b62009-01-31 16:36:08 +000015052 if (type != &PyUnicode_Type)
15053 return unicode_subtype_new(type, args, kwds);
15054 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015055 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015056 return NULL;
15057 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015058 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015059 if (encoding == NULL && errors == NULL)
15060 return PyObject_Str(x);
15061 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015062 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015063}
15064
Guido van Rossume023fe02001-08-30 03:12:59 +000015065static PyObject *
15066unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15067{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015068 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015069 Py_ssize_t length, char_size;
15070 int share_wstr, share_utf8;
15071 unsigned int kind;
15072 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015073
Benjamin Peterson14339b62009-01-31 16:36:08 +000015074 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015076 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015077 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015078 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015079 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015080 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015081 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015082 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015083 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015084
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015085 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015086 if (self == NULL) {
15087 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015088 return NULL;
15089 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015090 kind = PyUnicode_KIND(unicode);
15091 length = PyUnicode_GET_LENGTH(unicode);
15092
15093 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015094#ifdef Py_DEBUG
15095 _PyUnicode_HASH(self) = -1;
15096#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015097 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015098#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015099 _PyUnicode_STATE(self).interned = 0;
15100 _PyUnicode_STATE(self).kind = kind;
15101 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015102 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015103 _PyUnicode_STATE(self).ready = 1;
15104 _PyUnicode_WSTR(self) = NULL;
15105 _PyUnicode_UTF8_LENGTH(self) = 0;
15106 _PyUnicode_UTF8(self) = NULL;
15107 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015108 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015109
15110 share_utf8 = 0;
15111 share_wstr = 0;
15112 if (kind == PyUnicode_1BYTE_KIND) {
15113 char_size = 1;
15114 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15115 share_utf8 = 1;
15116 }
15117 else if (kind == PyUnicode_2BYTE_KIND) {
15118 char_size = 2;
15119 if (sizeof(wchar_t) == 2)
15120 share_wstr = 1;
15121 }
15122 else {
15123 assert(kind == PyUnicode_4BYTE_KIND);
15124 char_size = 4;
15125 if (sizeof(wchar_t) == 4)
15126 share_wstr = 1;
15127 }
15128
15129 /* Ensure we won't overflow the length. */
15130 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15131 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015132 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015133 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015134 data = PyObject_MALLOC((length + 1) * char_size);
15135 if (data == NULL) {
15136 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015137 goto onError;
15138 }
15139
Victor Stinnerc3c74152011-10-02 20:39:55 +020015140 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015141 if (share_utf8) {
15142 _PyUnicode_UTF8_LENGTH(self) = length;
15143 _PyUnicode_UTF8(self) = data;
15144 }
15145 if (share_wstr) {
15146 _PyUnicode_WSTR_LENGTH(self) = length;
15147 _PyUnicode_WSTR(self) = (wchar_t *)data;
15148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015149
Christian Heimesf051e432016-09-13 20:22:02 +020015150 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015151 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015152 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015153#ifdef Py_DEBUG
15154 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15155#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015156 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015157 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015158
15159onError:
15160 Py_DECREF(unicode);
15161 Py_DECREF(self);
15162 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015163}
15164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015165PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015166"str(object='') -> str\n\
15167str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015168\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015169Create a new string object from the given object. If encoding or\n\
15170errors is specified, then the object must expose a data buffer\n\
15171that will be decoded using the given encoding and error handler.\n\
15172Otherwise, returns the result of object.__str__() (if defined)\n\
15173or repr(object).\n\
15174encoding defaults to sys.getdefaultencoding().\n\
15175errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015176
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015177static PyObject *unicode_iter(PyObject *seq);
15178
Guido van Rossumd57fd912000-03-10 22:53:23 +000015179PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015180 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015181 "str", /* tp_name */
15182 sizeof(PyUnicodeObject), /* tp_size */
15183 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015184 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015185 (destructor)unicode_dealloc, /* tp_dealloc */
15186 0, /* tp_print */
15187 0, /* tp_getattr */
15188 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015189 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015190 unicode_repr, /* tp_repr */
15191 &unicode_as_number, /* tp_as_number */
15192 &unicode_as_sequence, /* tp_as_sequence */
15193 &unicode_as_mapping, /* tp_as_mapping */
15194 (hashfunc) unicode_hash, /* tp_hash*/
15195 0, /* tp_call*/
15196 (reprfunc) unicode_str, /* tp_str */
15197 PyObject_GenericGetAttr, /* tp_getattro */
15198 0, /* tp_setattro */
15199 0, /* tp_as_buffer */
15200 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015201 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015202 unicode_doc, /* tp_doc */
15203 0, /* tp_traverse */
15204 0, /* tp_clear */
15205 PyUnicode_RichCompare, /* tp_richcompare */
15206 0, /* tp_weaklistoffset */
15207 unicode_iter, /* tp_iter */
15208 0, /* tp_iternext */
15209 unicode_methods, /* tp_methods */
15210 0, /* tp_members */
15211 0, /* tp_getset */
15212 &PyBaseObject_Type, /* tp_base */
15213 0, /* tp_dict */
15214 0, /* tp_descr_get */
15215 0, /* tp_descr_set */
15216 0, /* tp_dictoffset */
15217 0, /* tp_init */
15218 0, /* tp_alloc */
15219 unicode_new, /* tp_new */
15220 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015221};
15222
15223/* Initialize the Unicode implementation */
15224
Victor Stinner3a50e702011-10-18 21:21:00 +020015225int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015226{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015227 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015228 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015229 0x000A, /* LINE FEED */
15230 0x000D, /* CARRIAGE RETURN */
15231 0x001C, /* FILE SEPARATOR */
15232 0x001D, /* GROUP SEPARATOR */
15233 0x001E, /* RECORD SEPARATOR */
15234 0x0085, /* NEXT LINE */
15235 0x2028, /* LINE SEPARATOR */
15236 0x2029, /* PARAGRAPH SEPARATOR */
15237 };
15238
Fred Drakee4315f52000-05-09 19:53:39 +000015239 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015240 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015241 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015242 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015243 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015244
Guido van Rossumcacfc072002-05-24 19:01:59 +000015245 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015246 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015247
15248 /* initialize the linebreak bloom filter */
15249 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015250 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015251 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015252
Christian Heimes26532f72013-07-20 14:57:16 +020015253 if (PyType_Ready(&EncodingMapType) < 0)
15254 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015255
Benjamin Petersonc4311282012-10-30 23:21:10 -040015256 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15257 Py_FatalError("Can't initialize field name iterator type");
15258
15259 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15260 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015261
Victor Stinner3a50e702011-10-18 21:21:00 +020015262 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015263}
15264
15265/* Finalize the Unicode implementation */
15266
/* This implementation performs no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
15272
Guido van Rossumd57fd912000-03-10 22:53:23 +000015273void
Thomas Wouters78890102000-07-22 19:25:51 +000015274_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015275{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015276 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015277
Serhiy Storchaka05997252013-01-26 12:14:02 +020015278 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015279
Serhiy Storchaka05997252013-01-26 12:14:02 +020015280 for (i = 0; i < 256; i++)
15281 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015282 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015283 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015284}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015285
Walter Dörwald16807132007-05-25 13:52:07 +000015286void
15287PyUnicode_InternInPlace(PyObject **p)
15288{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015289 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015291#ifdef Py_DEBUG
15292 assert(s != NULL);
15293 assert(_PyUnicode_CHECK(s));
15294#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015295 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015296 return;
15297#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 /* If it's a subclass, we don't really know what putting
15299 it in the interned dict might do. */
15300 if (!PyUnicode_CheckExact(s))
15301 return;
15302 if (PyUnicode_CHECK_INTERNED(s))
15303 return;
15304 if (interned == NULL) {
15305 interned = PyDict_New();
15306 if (interned == NULL) {
15307 PyErr_Clear(); /* Don't leave an exception */
15308 return;
15309 }
15310 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015311 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015312 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015313 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015314 if (t == NULL) {
15315 PyErr_Clear();
15316 return;
15317 }
15318 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015319 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015320 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015321 return;
15322 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 /* The two references in interned are not counted by refcnt.
15324 The deallocator will take care of this */
15325 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015326 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015327}
15328
15329void
15330PyUnicode_InternImmortal(PyObject **p)
15331{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015332 PyUnicode_InternInPlace(p);
15333 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015334 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 Py_INCREF(*p);
15336 }
Walter Dörwald16807132007-05-25 13:52:07 +000015337}
15338
15339PyObject *
15340PyUnicode_InternFromString(const char *cp)
15341{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015342 PyObject *s = PyUnicode_FromString(cp);
15343 if (s == NULL)
15344 return NULL;
15345 PyUnicode_InternInPlace(&s);
15346 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015347}
15348
Alexander Belopolsky40018472011-02-26 01:02:56 +000015349void
15350_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015351{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015353 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015354 Py_ssize_t i, n;
15355 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015356
Benjamin Peterson14339b62009-01-31 16:36:08 +000015357 if (interned == NULL || !PyDict_Check(interned))
15358 return;
15359 keys = PyDict_Keys(interned);
15360 if (keys == NULL || !PyList_Check(keys)) {
15361 PyErr_Clear();
15362 return;
15363 }
Walter Dörwald16807132007-05-25 13:52:07 +000015364
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15366 detector, interned unicode strings are not forcibly deallocated;
15367 rather, we give them their stolen references back, and then clear
15368 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015369
Benjamin Peterson14339b62009-01-31 16:36:08 +000015370 n = PyList_GET_SIZE(keys);
15371 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015372 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015374 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015375 if (PyUnicode_READY(s) == -1) {
15376 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015377 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015379 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015380 case SSTATE_NOT_INTERNED:
15381 /* XXX Shouldn't happen */
15382 break;
15383 case SSTATE_INTERNED_IMMORTAL:
15384 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015385 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015386 break;
15387 case SSTATE_INTERNED_MORTAL:
15388 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015389 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015390 break;
15391 default:
15392 Py_FatalError("Inconsistent interned string state.");
15393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015394 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015395 }
15396 fprintf(stderr, "total size of all interned strings: "
15397 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15398 "mortal/immortal\n", mortal_size, immortal_size);
15399 Py_DECREF(keys);
15400 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015401 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015402}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015403
15404
15405/********************* Unicode Iterator **************************/
15406
15407typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015408 PyObject_HEAD
15409 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015410 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015411} unicodeiterobject;
15412
15413static void
15414unicodeiter_dealloc(unicodeiterobject *it)
15415{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015416 _PyObject_GC_UNTRACK(it);
15417 Py_XDECREF(it->it_seq);
15418 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015419}
15420
15421static int
15422unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15423{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015424 Py_VISIT(it->it_seq);
15425 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015426}
15427
15428static PyObject *
15429unicodeiter_next(unicodeiterobject *it)
15430{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015431 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015432
Benjamin Peterson14339b62009-01-31 16:36:08 +000015433 assert(it != NULL);
15434 seq = it->it_seq;
15435 if (seq == NULL)
15436 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015437 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015439 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15440 int kind = PyUnicode_KIND(seq);
15441 void *data = PyUnicode_DATA(seq);
15442 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15443 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015444 if (item != NULL)
15445 ++it->it_index;
15446 return item;
15447 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015448
Benjamin Peterson14339b62009-01-31 16:36:08 +000015449 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015450 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015451 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015452}
15453
15454static PyObject *
15455unicodeiter_len(unicodeiterobject *it)
15456{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015457 Py_ssize_t len = 0;
15458 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015459 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015460 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015461}
15462
15463PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15464
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015465static PyObject *
15466unicodeiter_reduce(unicodeiterobject *it)
15467{
15468 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015469 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015470 it->it_seq, it->it_index);
15471 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015472 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015473 if (u == NULL)
15474 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015475 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015476 }
15477}
15478
15479PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15480
15481static PyObject *
15482unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15483{
15484 Py_ssize_t index = PyLong_AsSsize_t(state);
15485 if (index == -1 && PyErr_Occurred())
15486 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015487 if (it->it_seq != NULL) {
15488 if (index < 0)
15489 index = 0;
15490 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15491 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15492 it->it_index = index;
15493 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015494 Py_RETURN_NONE;
15495}
15496
15497PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15498
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015499static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015500 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015501 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015502 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15503 reduce_doc},
15504 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15505 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015506 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015507};
15508
15509PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015510 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15511 "str_iterator", /* tp_name */
15512 sizeof(unicodeiterobject), /* tp_basicsize */
15513 0, /* tp_itemsize */
15514 /* methods */
15515 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15516 0, /* tp_print */
15517 0, /* tp_getattr */
15518 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015519 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015520 0, /* tp_repr */
15521 0, /* tp_as_number */
15522 0, /* tp_as_sequence */
15523 0, /* tp_as_mapping */
15524 0, /* tp_hash */
15525 0, /* tp_call */
15526 0, /* tp_str */
15527 PyObject_GenericGetAttr, /* tp_getattro */
15528 0, /* tp_setattro */
15529 0, /* tp_as_buffer */
15530 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15531 0, /* tp_doc */
15532 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15533 0, /* tp_clear */
15534 0, /* tp_richcompare */
15535 0, /* tp_weaklistoffset */
15536 PyObject_SelfIter, /* tp_iter */
15537 (iternextfunc)unicodeiter_next, /* tp_iternext */
15538 unicodeiter_methods, /* tp_methods */
15539 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015540};
15541
15542static PyObject *
15543unicode_iter(PyObject *seq)
15544{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015545 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015546
Benjamin Peterson14339b62009-01-31 16:36:08 +000015547 if (!PyUnicode_Check(seq)) {
15548 PyErr_BadInternalCall();
15549 return NULL;
15550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015551 if (PyUnicode_READY(seq) == -1)
15552 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015553 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15554 if (it == NULL)
15555 return NULL;
15556 it->it_index = 0;
15557 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015558 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015559 _PyObject_GC_TRACK(it);
15560 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015561}
15562
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015563
/* Number of characters in the null-terminated string u, not counting the
   terminator (thin wrapper around wcslen()). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const size_t length = wcslen(u);

    return length;
}
15569
/* Copy the null-terminated string s2, terminator included, to s1 and
   return s1.  No bounds are checked. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        const Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15577
/* Copy at most n characters of the null-terminated string s2 to s1 and
   return s1.

   Copying stops right after the terminating null has been stored, so the
   remainder of s1 is NOT padded (unlike wcsncpy()).  If s2 has n or more
   characters, exactly n characters are stored and s1 is not
   null-terminated.

   The remaining budget is tested BEFORE each store.  Testing it after the
   store wrote up to n + 1 characters (one character even for n == 0) and
   could overrun a destination buffer of n elements. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    while (n-- != 0) {
        if ((*u++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15587
/* Append the null-terminated string s2, terminator included, to the end
   of the null-terminated string s1 and return s1.  No bounds are
   checked. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing at the current terminator of s1. */
    Py_UNICODE *dst = s1 + wcslen(s1);

    do {
        *dst = *s2++;
    } while (*dst++ != 0);

    return s1;
}
15596
/* Compare two null-terminated strings.  Returns -1, 0 or 1 when s1 is
   less than, equal to or greater than s2.  A string that is a proper
   prefix of the other one compares as less. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix.  Equality with a non-null *s1 implies that
       *s2 is non-null as well. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == *s2) {
        /* Only possible when both strings ended here. */
        return 0;
    }
    if (*s1 == 0) {
        return -1;
    }
    if (*s2 == 0) {
        return 1;
    }
    return (*s1 < *s2) ? -1 : +1;
}
15610
/* Compare at most n characters of two null-terminated strings.  Returns
   -1 or 1 at the first difference, and 0 when the strings agree up to a
   terminating null or over the first n characters. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b) {
            return (a < b) ? -1 : +1;
        }
        if (a == '\0') {
            /* Both strings ended at the same position. */
            return 0;
        }
    }
    return 0;
}
15627
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15637
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = wcslen(s);

    /* Walk backwards, starting with the character before the terminator. */
    while (pos > 0) {
        pos--;
        if (s[pos] == c) {
            return (Py_UNICODE*)(s + pos);
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015650
Victor Stinner71133ff2010-09-01 23:43:53 +000015651Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015652PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015653{
Victor Stinner577db2c2011-10-11 22:12:48 +020015654 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015655 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015657 if (!PyUnicode_Check(unicode)) {
15658 PyErr_BadArgument();
15659 return NULL;
15660 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015661 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015662 if (u == NULL)
15663 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015664 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015665 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015666 PyErr_NoMemory();
15667 return NULL;
15668 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015669 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015670 size *= sizeof(Py_UNICODE);
15671 copy = PyMem_Malloc(size);
15672 if (copy == NULL) {
15673 PyErr_NoMemory();
15674 return NULL;
15675 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015676 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015677 return copy;
15678}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015679
Georg Brandl66c221e2010-10-14 07:04:07 +000015680/* A _string module, to export formatter_parser and formatter_field_name_split
15681 to the string.Formatter class implemented in Python. */
15682
15683static PyMethodDef _string_methods[] = {
15684 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15685 METH_O, PyDoc_STR("split the argument as a field name")},
15686 {"formatter_parser", (PyCFunction) formatter_parser,
15687 METH_O, PyDoc_STR("parse the argument as a format string")},
15688 {NULL, NULL}
15689};
15690
15691static struct PyModuleDef _string_module = {
15692 PyModuleDef_HEAD_INIT,
15693 "_string",
15694 PyDoc_STR("string helper module"),
15695 0,
15696 _string_methods,
15697 NULL,
15698 NULL,
15699 NULL,
15700 NULL
15701};
15702
15703PyMODINIT_FUNC
15704PyInit__string(void)
15705{
15706 return PyModule_Create(&_string_module);
15707}
15708
15709
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015710#ifdef __cplusplus
15711}
15712#endif