blob: 0e6cb7f2a5e666e7c1a3976742467b926dfb255c [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090052class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
55
56/*[python input]
57class Py_UCS4_converter(CConverter):
58 type = 'Py_UCS4'
59 converter = 'convert_uc'
60
61 def converter_init(self):
62 if self.default is not unspecified:
63 self.c_default = ascii(self.default)
64 if len(self.c_default) > 4 or self.c_default[0] != "'":
65 self.c_default = hex(ord(self.default))
66
67[python start generated code]*/
68/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080069
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000070/* --- Globals ------------------------------------------------------------
71
Serhiy Storchaka05997252013-01-26 12:14:02 +020072NOTE: In the interpreter's initialization phase, some globals are currently
73 initialized dynamically as needed. In the process Unicode objects may
74 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000075
76*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000078
79#ifdef __cplusplus
80extern "C" {
81#endif
82
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Type check used by the accessor macros in this file.  Debug builds run
   the full consistency checker (without content validation); other builds
   fall back to a plain type check. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020091
/* --- Raw field accessors ------------------------------------------------
   The underscore-prefixed macros without an assert() only cast `op` and
   select a struct member; they perform no validation. */

/* Stored UTF-8 pointer of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: for compact ASCII strings this is the
   memory immediately following the PyASCIIObject header, otherwise the
   stored utf8 member. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Stored UTF-8 length of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the string length for compact ASCII
   strings, otherwise the stored utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and wstr length members (unchecked). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the PyASCIIObject header (unchecked). */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, preceded by a _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any member of a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* File-local replacement for PyUnicode_READY(): asserts the object passes
   _PyUnicode_CHECK(), yields 0 when the string is already ready, and
   otherwise yields the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200133
/* True if the UTF-8 pointer of `op` is the same pointer as its character
   data.  `op` must not be a compact ASCII string (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same pointer as its character
   data.  Both sides of the comparison use the macro argument `op`; the
   previous definition referred to an identifier named `unicode`, which
   only compiled when the caller happened to have such a variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
141
/* True if the Unicode object has its own UTF-8 memory block: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer is not
   the character data itself (i.e. not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has its own wstr memory block: the wstr
   pointer is set and either the string is not ready yet or wstr is not
   the character data itself (i.e. not shared). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
155
/* Generic helper macro converting a run of characters from one integer
   type to another.

   from_type, to_type: valid type names.
   begin, end:         pointers delimiting the source characters
                       ("from_type *"); `end` is exclusive.
   to:                 pointer ("to_type *") to the output buffer, which
                       must have room for (end - begin) elements.

   The bulk of the input is copied four elements per iteration; the
   remaining 0-3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t _count = (_end) - (_iter);           \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);     \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); _iter++, _to++) {        \
            *_to = (to_type) *_iter;                    \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200179
/* Divisor used when overallocating buffers; the best value differs per
   platform. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
187
Walter Dörwald16807132007-05-25 13:52:07 +0000188/* This dictionary holds all interned unicode strings. Note that references
189 to strings in this dictionary are *not* counted in the string's ob_refcnt.
190 When the interned string reaches a refcnt of 0 the string deallocation
191 function will delete the reference from this dictionary.
192
193 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000194 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000195*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200196static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200199static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200200
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else {                                          \
            Py_INCREF(unicode_empty);                   \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is NULL if the singleton could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200220/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700221static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200222_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200224/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000227/* Single character Unicode strings in the Latin-1 range are being
228 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200229static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, holding 1 for whitespace and 0 otherwise
   (entries not listed below are zero-initialized). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
261
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200262/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200263static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200264static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100265static int unicode_modifiable(PyObject *unicode);
266
Victor Stinnerfe226c02011-10-03 03:52:20 +0200267
Alexander Belopolsky40018472011-02-26 01:02:56 +0000268static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100269_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200270static PyObject *
271_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272static PyObject *
273_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274
275static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000276unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000277 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100278 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000279 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280
Alexander Belopolsky40018472011-02-26 01:02:56 +0000281static void
282raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300283 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100284 PyObject *unicode,
285 Py_ssize_t startpos, Py_ssize_t endpos,
286 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000287
/* Same idea as the whitespace table, for line breaks: a 128-entry table
   indexed by ASCII code, holding 1 for line-break characters and 0
   otherwise (entries not listed below are zero-initialized). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
315
INADA Naoki3ae20562017-01-16 20:41:20 +0900316static int convert_uc(PyObject *obj, void *addr);
317
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300318#include "clinic/unicodeobject.c.h"
319
/* Identifiers for the codec error handlers recognised by name.
   _Py_ERROR_OTHER stands for any name that is not in the list below. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching identifier
   for the other known names (exact, case-sensitive comparison), and
   _Py_ERROR_OTHER for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
358
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300359/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000362PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000363{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000364#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000365 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000366#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000367 /* This is actually an illegal character, so it should
368 not be passed to unichr. */
369 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000370#endif
371}
372
#ifdef Py_DEBUG
/* Debug-only sanity check of the internal layout of a Unicode object.

   op:            object to check; must be a Unicode object (asserted).
   check_content: if non-zero (and the string is not in the legacy
                  wchar-only state), also scan the characters to verify
                  that the narrowest possible kind is used and that the
                  buffer is NUL-terminated.

   Every violated invariant trips an assert(); the function returns 1
   otherwise, so it can itself be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character buffer has the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that has not been made ready yet */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* legacy string that is ready */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character buffer exactly when the kind has
               the width of wchar_t */
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the buffer must be NUL-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
488
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489static PyObject*
490unicode_result_wchar(PyObject *unicode)
491{
492#ifndef Py_DEBUG
493 Py_ssize_t len;
494
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100495 len = _PyUnicode_WSTR_LENGTH(unicode);
496 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100497 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 }
500
501 if (len == 1) {
502 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100503 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100504 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505 Py_DECREF(unicode);
506 return latin1_char;
507 }
508 }
509
510 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200511 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 return NULL;
513 }
514#else
Victor Stinneraa771272012-10-04 02:32:58 +0200515 assert(Py_REFCNT(unicode) == 1);
516
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100517 /* don't make the result ready in debug mode to ensure that the caller
518 makes the string ready before using it */
519 assert(_PyUnicode_CheckConsistency(unicode, 1));
520#endif
521 return unicode;
522}
523
524static PyObject*
525unicode_result_ready(PyObject *unicode)
526{
527 Py_ssize_t length;
528
529 length = PyUnicode_GET_LENGTH(unicode);
530 if (length == 0) {
531 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100532 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200533 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 }
535 return unicode_empty;
536 }
537
538 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200539 void *data = PyUnicode_DATA(unicode);
540 int kind = PyUnicode_KIND(unicode);
541 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 if (ch < 256) {
543 PyObject *latin1_char = unicode_latin1[ch];
544 if (latin1_char != NULL) {
545 if (unicode != latin1_char) {
546 Py_INCREF(latin1_char);
547 Py_DECREF(unicode);
548 }
549 return latin1_char;
550 }
551 else {
552 assert(_PyUnicode_CheckConsistency(unicode, 1));
553 Py_INCREF(unicode);
554 unicode_latin1[ch] = unicode;
555 return unicode;
556 }
557 }
558 }
559
560 assert(_PyUnicode_CheckConsistency(unicode, 1));
561 return unicode;
562}
563
564static PyObject*
565unicode_result(PyObject *unicode)
566{
567 assert(_PyUnicode_CHECK(unicode));
568 if (PyUnicode_IS_READY(unicode))
569 return unicode_result_ready(unicode);
570 else
571 return unicode_result_wchar(unicode);
572}
573
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574static PyObject*
575unicode_result_unchanged(PyObject *unicode)
576{
577 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500578 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100579 return NULL;
580 Py_INCREF(unicode);
581 return unicode;
582 }
583 else
584 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100585 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100586}
587
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200588/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589 ASCII, Latin1, UTF-8, etc. */
590static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200591backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200592 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593{
Victor Stinnerad771582015-10-09 12:38:53 +0200594 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200595 Py_UCS4 ch;
596 enum PyUnicode_Kind kind;
597 void *data;
598
599 assert(PyUnicode_IS_READY(unicode));
600 kind = PyUnicode_KIND(unicode);
601 data = PyUnicode_DATA(unicode);
602
603 size = 0;
604 /* determine replacement size */
605 for (i = collstart; i < collend; ++i) {
606 Py_ssize_t incr;
607
608 ch = PyUnicode_READ(kind, data, i);
609 if (ch < 0x100)
610 incr = 2+2;
611 else if (ch < 0x10000)
612 incr = 2+4;
613 else {
614 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200615 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200616 }
617 if (size > PY_SSIZE_T_MAX - incr) {
618 PyErr_SetString(PyExc_OverflowError,
619 "encoded result is too long for a Python string");
620 return NULL;
621 }
622 size += incr;
623 }
624
Victor Stinnerad771582015-10-09 12:38:53 +0200625 str = _PyBytesWriter_Prepare(writer, str, size);
626 if (str == NULL)
627 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628
629 /* generate replacement */
630 for (i = collstart; i < collend; ++i) {
631 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200632 *str++ = '\\';
633 if (ch >= 0x00010000) {
634 *str++ = 'U';
635 *str++ = Py_hexdigits[(ch>>28)&0xf];
636 *str++ = Py_hexdigits[(ch>>24)&0xf];
637 *str++ = Py_hexdigits[(ch>>20)&0xf];
638 *str++ = Py_hexdigits[(ch>>16)&0xf];
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200641 }
Victor Stinner797485e2015-10-09 03:17:30 +0200642 else if (ch >= 0x100) {
643 *str++ = 'u';
644 *str++ = Py_hexdigits[(ch>>12)&0xf];
645 *str++ = Py_hexdigits[(ch>>8)&0xf];
646 }
647 else
648 *str++ = 'x';
649 *str++ = Py_hexdigits[(ch>>4)&0xf];
650 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 }
652 return str;
653}
654
655/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656 ASCII, Latin1, UTF-8, etc. */
657static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200658xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200659 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660{
Victor Stinnerad771582015-10-09 12:38:53 +0200661 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 Py_UCS4 ch;
663 enum PyUnicode_Kind kind;
664 void *data;
665
666 assert(PyUnicode_IS_READY(unicode));
667 kind = PyUnicode_KIND(unicode);
668 data = PyUnicode_DATA(unicode);
669
670 size = 0;
671 /* determine replacement size */
672 for (i = collstart; i < collend; ++i) {
673 Py_ssize_t incr;
674
675 ch = PyUnicode_READ(kind, data, i);
676 if (ch < 10)
677 incr = 2+1+1;
678 else if (ch < 100)
679 incr = 2+2+1;
680 else if (ch < 1000)
681 incr = 2+3+1;
682 else if (ch < 10000)
683 incr = 2+4+1;
684 else if (ch < 100000)
685 incr = 2+5+1;
686 else if (ch < 1000000)
687 incr = 2+6+1;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+7+1;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
Victor Stinnerad771582015-10-09 12:38:53 +0200700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707 }
708 return str;
709}
710
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711/* --- Bloom Filters ----------------------------------------------------- */
712
713/* stuff to implement simple "bloom filters" for Unicode characters.
714 to keep things simple, we use a single bitmask, using the least 5
715 bits from each unicode characters as the bit index. */
716
717/* the linebreak mask is set up by Unicode_Init below */
718
Antoine Pitrouf068f942010-01-13 14:19:12 +0000719#if LONG_BIT >= 128
720#define BLOOM_WIDTH 128
721#elif LONG_BIT >= 64
722#define BLOOM_WIDTH 64
723#elif LONG_BIT >= 32
724#define BLOOM_WIDTH 32
725#else
726#error "LONG_BIT is smaller than 32"
727#endif
728
Thomas Wouters477c8d52006-05-27 19:21:47 +0000729#define BLOOM_MASK unsigned long
730
Serhiy Storchaka05997252013-01-26 12:14:02 +0200731static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
Benjamin Peterson29060642009-01-31 22:14:21 +0000735#define BLOOM_LINEBREAK(ch) \
736 ((ch) < 128U ? ascii_linebreak[(ch)] : \
737 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700739static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741{
Victor Stinnera85af502013-04-09 21:53:54 +0200742#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743 do { \
744 TYPE *data = (TYPE *)PTR; \
745 TYPE *end = data + LEN; \
746 Py_UCS4 ch; \
747 for (; data != end; data++) { \
748 ch = *data; \
749 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750 } \
751 break; \
752 } while (0)
753
Thomas Wouters477c8d52006-05-27 19:21:47 +0000754 /* calculate simple bloom-style bitmask for a given unicode string */
755
Antoine Pitrouf068f942010-01-13 14:19:12 +0000756 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000757
758 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200759 switch (kind) {
760 case PyUnicode_1BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762 break;
763 case PyUnicode_2BYTE_KIND:
764 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765 break;
766 case PyUnicode_4BYTE_KIND:
767 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768 break;
769 default:
770 assert(0);
771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200773
774#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775}
776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300777static int
778ensure_unicode(PyObject *obj)
779{
780 if (!PyUnicode_Check(obj)) {
781 PyErr_Format(PyExc_TypeError,
782 "must be str, not %.100s",
783 Py_TYPE(obj)->tp_name);
784 return -1;
785 }
786 return PyUnicode_READY(obj);
787}
788
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200789/* Compilation of templated routines */
790
791#include "stringlib/asciilib.h"
792#include "stringlib/fastsearch.h"
793#include "stringlib/partition.h"
794#include "stringlib/split.h"
795#include "stringlib/count.h"
796#include "stringlib/find.h"
797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs1lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300807#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs2lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300818#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
823#include "stringlib/ucs4lib.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/partition.h"
826#include "stringlib/split.h"
827#include "stringlib/count.h"
828#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300829#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200830#include "stringlib/find_max_char.h"
831#include "stringlib/localeutil.h"
832#include "stringlib/undef.h"
833
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834#include "stringlib/unicodedefs.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/count.h"
837#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100838#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200839
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840/* --- Unicode Object ----------------------------------------------------- */
841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200843fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700845static inline Py_ssize_t
846findchar(const void *s, int kind,
847 Py_ssize_t size, Py_UCS4 ch,
848 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 switch (kind) {
851 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200852 if ((Py_UCS1) ch != ch)
853 return -1;
854 if (direction > 0)
855 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856 else
857 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200858 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200859 if ((Py_UCS2) ch != ch)
860 return -1;
861 if (direction > 0)
862 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863 else
864 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200866 if (direction > 0)
867 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868 else
869 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200870 default:
871 assert(0);
872 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
Victor Stinnerafffce42012-10-03 23:03:17 +0200876#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000877/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200878 earlier.
879
880 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
881 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
882 invalid character in Unicode 6.0. */
883static void
884unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
885{
886 int kind = PyUnicode_KIND(unicode);
887 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
888 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
889 if (length <= old_length)
890 return;
891 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
892}
893#endif
894
/* Resize a compact string to hold `length` characters by reallocating the
   whole object (header and character data live in one memory block).

   The string must be modifiable, ready and compact (asserted).

   Return the resized object, which may be at a different address than
   `unicode`: callers must use the returned pointer.  Return NULL with
   MemoryError set if the new size would overflow or the reallocation fails;
   in that case `unicode` is still a valid object, although a separately
   allocated UTF-8 buffer has already been released. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used as the size of one character in bytes */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* read before the realloc: remember whether wstr pointed at the data */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* make sure struct_size + (length + 1) * char_size cannot overflow */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* header + characters + terminating null character */
    new_size = (struct_size + (length + 1) * char_size);

    /* release a separately allocated UTF-8 buffer */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* NOTE(review): presumably undoes the debug reference bookkeeping for
       the old address, since the realloc below may move the object; it is
       redone with _Py_NewReference() on both outcomes -- confirm */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    if (new_unicode == NULL) {
        /* the old block is untouched: register it again and fail */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr shared the character data: point it at the new location */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* release a separately allocated wstr buffer */
        PyObject_DEL(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* write the terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
961
/* Resize a non-compact string to `length` characters without moving the
   object itself: only its separately allocated buffers are reallocated.

   The string must not be compact and must have a reference count of 1
   (asserted).

   If the string is ready, the character data is resized first; the wstr
   buffer is then resized as well unless it shares the character data or
   does not exist.  If the string is not ready, only wstr is resized.

   Return 0 on success, -1 with MemoryError set on overflow or allocation
   failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* the kind value is used as the size of one character in bytes */
        char_size = PyUnicode_KIND(unicode);
        /* read before the realloc: remember which pointers alias the data */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* make sure (length + 1) * char_size cannot overflow */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* release a separately allocated UTF-8 buffer */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* on failure the object still references the old, untouched buffer */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* pointers that aliased the old buffer follow it to the new one */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* done unless there is a separate wstr buffer to resize below */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr = _PyUnicode_WSTR(unicode);
    /* on failure the object still references the old, untouched buffer */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001073 Ux0000 terminated; some code (e.g. new_identifier)
1074 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return the value of the PyUnicode_UTF8() macro for the object. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1186
/* Debugger helper: return the value of the _PyUnicode_COMPACT_DATA() macro
   for the object. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Decode a buffer of 16-bit wchar_t units into the UCS4 storage of a
   4-byte-kind string.  A high surrogate immediately followed by a low
   surrogate is joined into a single code point; every other unit (including
   a lone surrogate) is copied through unchanged.

   The destination must already be sized for the decoded result: the number
   of code points written is asserted to equal the string's length.  The
   terminating null character is not written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;
        Py_ssize_t consumed;

        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            consumed = 2;
        }
        else {
            ch = *src;
            consumed = 1;
        }

        *dst = ch;
        dst++;
        src += consumed;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
1522 assert(0);
1523 return -1;
1524 }
1525 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001526 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001527 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001528 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001529 Py_ssize_t i;
1530
Victor Stinnera0702ab2011-09-29 14:14:38 +02001531 for (i=0; i < how_many; i++) {
1532 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (ch > to_maxchar)
1534 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001535 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 }
1538 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 return 0;
1540}
1541
Victor Stinnerd3f08822012-05-29 12:57:52 +02001542void
1543_PyUnicode_FastCopyCharacters(
1544 PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546{
1547 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548}
1549
1550Py_ssize_t
1551PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many)
1554{
1555 int err;
1556
1557 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561
Benjamin Petersonbac79492012-01-14 13:34:47 -05001562 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001564 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001567 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001568 PyErr_SetString(PyExc_IndexError, "string index out of range");
1569 return -1;
1570 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001571 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001572 PyErr_SetString(PyExc_IndexError, "string index out of range");
1573 return -1;
1574 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001575 if (how_many < 0) {
1576 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577 return -1;
1578 }
1579 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001582 "Cannot write %zi characters at %zi "
1583 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001584 how_many, to_start, PyUnicode_GET_LENGTH(to));
1585 return -1;
1586 }
1587
1588 if (how_many == 0)
1589 return 0;
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 return -1;
1593
1594 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595 if (err) {
1596 PyErr_Format(PyExc_SystemError,
1597 "Cannot copy %s characters "
1598 "into a string of %s characters",
1599 unicode_kind_name(from),
1600 unicode_kind_name(to));
1601 return -1;
1602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604}
1605
Victor Stinner17222162011-09-28 22:15:37 +02001606/* Find the maximum code point and count the number of surrogate pairs so a
1607 correct string length can be computed before converting a string to UCS4.
1608 This function counts single surrogates as a character and not as a pair.
1609
1610 Return 0 on success, or -1 on error. */
1611static int
1612find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614{
1615 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001616 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617
Victor Stinnerc53be962011-10-02 21:33:54 +02001618 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 *num_surrogates = 0;
1620 *maxchar = 0;
1621
1622 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001624 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625 && (iter+1) < end
1626 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627 {
1628 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629 ++(*num_surrogates);
1630 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001634 {
1635 ch = *iter;
1636 iter++;
1637 }
1638 if (ch > *maxchar) {
1639 *maxchar = ch;
1640 if (*maxchar > MAX_UNICODE) {
1641 PyErr_Format(PyExc_ValueError,
1642 "character U+%x is not in range [U+0000; U+10ffff]",
1643 ch);
1644 return -1;
1645 }
1646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 }
1648 return 0;
1649}
1650
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001651int
1652_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653{
1654 wchar_t *end;
1655 Py_UCS4 maxchar = 0;
1656 Py_ssize_t num_surrogates;
1657#if SIZEOF_WCHAR_T == 2
1658 Py_ssize_t length_wo_surrogates;
1659#endif
1660
Georg Brandl7597add2011-10-05 16:36:47 +02001661 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001662 strings were created using _PyObject_New() and where no canonical
1663 representation (the str field) has been set yet aka strings
1664 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001665 assert(_PyUnicode_CHECK(unicode));
1666 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001668 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001669 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001670 /* Actually, it should neither be interned nor be anything else: */
1671 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001674 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677
1678 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001679 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyErr_NoMemory();
1682 return -1;
1683 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 _PyUnicode_WSTR(unicode), end,
1686 PyUnicode_1BYTE_DATA(unicode));
1687 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001691 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 }
1695 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001696 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
1700 PyObject_FREE(_PyUnicode_WSTR(unicode));
1701 _PyUnicode_WSTR(unicode) = NULL;
1702 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703 }
1704 /* In this case we might have to convert down from 4-byte native
1705 wchar_t to 2-byte unicode. */
1706 else if (maxchar < 65536) {
1707 assert(num_surrogates == 0 &&
1708 "FindMaxCharAndNumSurrogatePairs() messed up");
1709
Victor Stinner506f5922011-09-28 22:34:18 +02001710#if SIZEOF_WCHAR_T == 2
1711 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718#else
1719 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001720 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001721 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001723 PyErr_NoMemory();
1724 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 }
Victor Stinner506f5922011-09-28 22:34:18 +02001726 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727 _PyUnicode_WSTR(unicode), end,
1728 PyUnicode_2BYTE_DATA(unicode));
1729 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001732 _PyUnicode_UTF8(unicode) = NULL;
1733 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 }
1739 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740 else {
1741#if SIZEOF_WCHAR_T == 2
1742 /* in case the native representation is 2-bytes, we need to allocate a
1743 new normalized 4-byte version. */
1744 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001745 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746 PyErr_NoMemory();
1747 return -1;
1748 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001749 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 PyErr_NoMemory();
1752 return -1;
1753 }
1754 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001758 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 PyObject_FREE(_PyUnicode_WSTR(unicode));
1762 _PyUnicode_WSTR(unicode) = NULL;
1763 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764#else
1765 assert(num_surrogates == 0);
1766
Victor Stinnerc3c74152011-10-02 20:39:55 +02001767 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001769 _PyUnicode_UTF8(unicode) = NULL;
1770 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772#endif
1773 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774 }
1775 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001776 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 return 0;
1778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001781unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald16807132007-05-25 13:52:07 +00001783 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 case SSTATE_NOT_INTERNED:
1785 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001786
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 case SSTATE_INTERNED_MORTAL:
1788 /* revive dead object temporarily for DelItem */
1789 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001790 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 Py_FatalError(
1792 "deletion of interned string failed");
1793 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001794
Benjamin Peterson29060642009-01-31 22:14:21 +00001795 case SSTATE_INTERNED_IMMORTAL:
1796 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001797
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 default:
1799 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001800 }
1801
Victor Stinner03490912011-10-03 23:45:12 +02001802 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001804 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001806 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001809 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of exactly one character with a canonical
       representation can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1828
Alexander Belopolsky40018472011-02-26 01:02:56 +00001829static int
Victor Stinner488fa492011-12-12 00:01:39 +01001830unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831{
Victor Stinner488fa492011-12-12 00:01:39 +01001832 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 if (Py_REFCNT(unicode) != 1)
1834 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001835 if (_PyUnicode_HASH(unicode) != -1)
1836 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837 if (PyUnicode_CHECK_INTERNED(unicode))
1838 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001839 if (!PyUnicode_CheckExact(unicode))
1840 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001841#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001842 /* singleton refcount is greater than 1 */
1843 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001844#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001845 return 1;
1846}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848static int
1849unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850{
1851 PyObject *unicode;
1852 Py_ssize_t old_length;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856
1857 assert(unicode != NULL);
1858 assert(PyUnicode_Check(unicode));
1859 assert(0 <= length);
1860
Victor Stinner910337b2011-10-03 03:20:16 +02001861 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 old_length = PyUnicode_WSTR_LENGTH(unicode);
1863 else
1864 old_length = PyUnicode_GET_LENGTH(unicode);
1865 if (old_length == length)
1866 return 0;
1867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001869 _Py_INCREF_UNICODE_EMPTY();
1870 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001872 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001873 return 0;
1874 }
1875
Victor Stinner488fa492011-12-12 00:01:39 +01001876 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *copy = resize_copy(unicode, length);
1878 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001880 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882 }
1883
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001885 PyObject *new_unicode = resize_compact(unicode, length);
1886 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001888 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001891 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001892}
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001896{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897 PyObject *unicode;
1898 if (p_unicode == NULL) {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001903 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001904 {
1905 PyErr_BadInternalCall();
1906 return -1;
1907 }
1908 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001911/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001913 WARNING: The function doesn't copy the terminating null character and
1914 doesn't check the maximum character (may write a latin1 character in an
1915 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001916static void
1917unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001919{
1920 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923
1924 switch (kind) {
1925 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001927#ifdef Py_DEBUG
1928 if (PyUnicode_IS_ASCII(unicode)) {
1929 Py_UCS4 maxchar = ucs1lib_find_max_char(
1930 (const Py_UCS1*)str,
1931 (const Py_UCS1*)str + len);
1932 assert(maxchar < 128);
1933 }
1934#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001935 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 case PyUnicode_2BYTE_KIND: {
1939 Py_UCS2 *start = (Py_UCS2 *)data + index;
1940 Py_UCS2 *ucs2 = start;
1941 assert(index <= PyUnicode_GET_LENGTH(unicode));
1942
Victor Stinner184252a2012-06-16 02:57:41 +02001943 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 *ucs2 = (Py_UCS2)*str;
1945
1946 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001947 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 default: {
1950 Py_UCS4 *start = (Py_UCS4 *)data + index;
1951 Py_UCS4 *ucs4 = start;
1952 assert(kind == PyUnicode_4BYTE_KIND);
1953 assert(index <= PyUnicode_GET_LENGTH(unicode));
1954
Victor Stinner184252a2012-06-16 02:57:41 +02001955 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001956 *ucs4 = (Py_UCS4)*str;
1957
1958 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001959 }
1960 }
1961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963static PyObject*
1964get_latin1_char(unsigned char ch)
1965{
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode)
1970 return NULL;
1971 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 unicode_latin1[ch] = unicode;
1974 }
1975 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001976 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977}
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979static PyObject*
1980unicode_char(Py_UCS4 ch)
1981{
1982 PyObject *unicode;
1983
1984 assert(ch <= MAX_UNICODE);
1985
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001986 if (ch < 256)
1987 return get_latin1_char(ch);
1988
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 unicode = PyUnicode_New(1, ch);
1990 if (unicode == NULL)
1991 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001992
1993 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001996 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999 }
2000 assert(_PyUnicode_CheckConsistency(unicode, 1));
2001 return unicode;
2002}
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004PyObject *
2005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002007 if (u == NULL)
2008 return (PyObject*)_PyUnicode_New(size);
2009
2010 if (size < 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 return PyUnicode_FromWideChar(u, size);
2016}
2017
2018PyObject *
2019PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 Py_UCS4 maxchar = 0;
2023 Py_ssize_t num_surrogates;
2024
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL && size != 0) {
2026 PyErr_BadInternalCall();
2027 return NULL;
2028 }
2029
2030 if (size == -1) {
2031 size = wcslen(u);
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 /* If the Unicode data is known at construction time, we can apply
2035 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002038 if (size == 0)
2039 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 /* Single character Unicode objects in the Latin-1 range are
2042 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002043 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return get_latin1_char((unsigned char)*u);
2045
2046 /* If not empty and not single character, copy the Unicode data
2047 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002048 if (find_maxchar_surrogates(u, u + size,
2049 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 return NULL;
2051
Victor Stinner8faf8212011-12-08 22:14:11 +01002052 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!unicode)
2054 return NULL;
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 switch (PyUnicode_KIND(unicode)) {
2057 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060 break;
2061 case PyUnicode_2BYTE_KIND:
2062#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002063 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002065 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067#endif
2068 break;
2069 case PyUnicode_4BYTE_KIND:
2070#if SIZEOF_WCHAR_T == 2
2071 /* This is the only case which has to process surrogates, thus
2072 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002073 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#else
2075 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002076 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077#endif
2078 break;
2079 default:
2080 assert(0 && "Impossible state");
2081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002083 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (size < 0) {
2090 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 return NULL;
2093 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002094 if (u != NULL)
2095 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096 else
2097 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002098}
2099
Alexander Belopolsky40018472011-02-26 01:02:56 +00002100PyObject *
2101PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102{
2103 size_t size = strlen(u);
2104 if (size > PY_SSIZE_T_MAX) {
2105 PyErr_SetString(PyExc_OverflowError, "input too long");
2106 return NULL;
2107 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002108 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002109}
2110
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002111PyObject *
2112_PyUnicode_FromId(_Py_Identifier *id)
2113{
2114 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002115 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116 strlen(id->string),
2117 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002118 if (!id->object)
2119 return NULL;
2120 PyUnicode_InternInPlace(&id->object);
2121 assert(!id->next);
2122 id->next = static_strings;
2123 static_strings = id;
2124 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002125 return id->object;
2126}
2127
2128void
2129_PyUnicode_ClearStaticStrings()
2130{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002131 _Py_Identifier *tmp, *s = static_strings;
2132 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002133 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002134 tmp = s->next;
2135 s->next = NULL;
2136 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002138 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139}
2140
Benjamin Peterson0df54292012-03-26 14:50:32 -04002141/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Victor Stinnerd3f08822012-05-29 12:57:52 +02002143PyObject*
2144_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002145{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002146 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002148 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002149#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002152 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002153 }
Victor Stinner785938e2011-12-11 20:09:03 +01002154 unicode = PyUnicode_New(size, 127);
2155 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002156 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002157 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002160}
2161
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002162static Py_UCS4
2163kind_maxchar_limit(unsigned int kind)
2164{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002165 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002166 case PyUnicode_1BYTE_KIND:
2167 return 0x80;
2168 case PyUnicode_2BYTE_KIND:
2169 return 0x100;
2170 case PyUnicode_4BYTE_KIND:
2171 return 0x10000;
2172 default:
2173 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002175 }
2176}
2177
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002178static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002179align_maxchar(Py_UCS4 maxchar)
2180{
2181 if (maxchar <= 127)
2182 return 127;
2183 else if (maxchar <= 255)
2184 return 255;
2185 else if (maxchar <= 65535)
2186 return 65535;
2187 else
2188 return MAX_UNICODE;
2189}
2190
Victor Stinner702c7342011-10-05 13:50:52 +02002191static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002192_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002195 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196
Serhiy Storchaka678db842013-01-26 12:16:36 +02002197 if (size == 0)
2198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002199 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002200 if (size == 1)
2201 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002202
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002203 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002204 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 if (!res)
2206 return NULL;
2207 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002208 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002210}
2211
Victor Stinnere57b1c02011-09-28 22:20:48 +02002212static PyObject*
2213_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214{
2215 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002216 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002217
Serhiy Storchaka678db842013-01-26 12:16:36 +02002218 if (size == 0)
2219 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002220 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002221 if (size == 1)
2222 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002223
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002224 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002225 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 if (!res)
2227 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002230 else {
2231 _PyUnicode_CONVERT_BYTES(
2232 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002234 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 return res;
2236}
2237
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238static PyObject*
2239_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240{
2241 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002242 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243
Serhiy Storchaka678db842013-01-26 12:16:36 +02002244 if (size == 0)
2245 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002247 if (size == 1)
2248 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002250 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 if (!res)
2253 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002254 if (max_char < 256)
2255 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256 PyUnicode_1BYTE_DATA(res));
2257 else if (max_char < 0x10000)
2258 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259 PyUnicode_2BYTE_DATA(res));
2260 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002262 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 return res;
2264}
2265
2266PyObject*
2267PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002269 if (size < 0) {
2270 PyErr_SetString(PyExc_ValueError, "size must be positive");
2271 return NULL;
2272 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002273 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002275 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002277 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002281 PyErr_SetString(PyExc_SystemError, "invalid kind");
2282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284}
2285
Victor Stinnerece58de2012-04-23 23:36:38 +02002286Py_UCS4
2287_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288{
2289 enum PyUnicode_Kind kind;
2290 void *startptr, *endptr;
2291
2292 assert(PyUnicode_IS_READY(unicode));
2293 assert(0 <= start);
2294 assert(end <= PyUnicode_GET_LENGTH(unicode));
2295 assert(start <= end);
2296
2297 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2298 return PyUnicode_MAX_CHAR_VALUE(unicode);
2299
2300 if (start == end)
2301 return 127;
2302
Victor Stinner94d558b2012-04-27 22:26:58 +02002303 if (PyUnicode_IS_ASCII(unicode))
2304 return 127;
2305
Victor Stinnerece58de2012-04-23 23:36:38 +02002306 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002307 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002308 endptr = (char *)startptr + end * kind;
2309 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002310 switch(kind) {
2311 case PyUnicode_1BYTE_KIND:
2312 return ucs1lib_find_max_char(startptr, endptr);
2313 case PyUnicode_2BYTE_KIND:
2314 return ucs2lib_find_max_char(startptr, endptr);
2315 case PyUnicode_4BYTE_KIND:
2316 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002318 assert(0);
2319 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002320 }
2321}
2322
Victor Stinner25a4b292011-10-06 12:31:55 +02002323/* Ensure that a string uses the most efficient storage, if it is not the
2324 case: create a new string with of the right kind. Write NULL into *p_unicode
2325 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002326static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002327unicode_adjust_maxchar(PyObject **p_unicode)
2328{
2329 PyObject *unicode, *copy;
2330 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002331 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 unsigned int kind;
2333
2334 assert(p_unicode != NULL);
2335 unicode = *p_unicode;
2336 assert(PyUnicode_IS_READY(unicode));
2337 if (PyUnicode_IS_ASCII(unicode))
2338 return;
2339
2340 len = PyUnicode_GET_LENGTH(unicode);
2341 kind = PyUnicode_KIND(unicode);
2342 if (kind == PyUnicode_1BYTE_KIND) {
2343 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002344 max_char = ucs1lib_find_max_char(u, u + len);
2345 if (max_char >= 128)
2346 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002347 }
2348 else if (kind == PyUnicode_2BYTE_KIND) {
2349 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002350 max_char = ucs2lib_find_max_char(u, u + len);
2351 if (max_char >= 256)
2352 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002353 }
2354 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002356 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 max_char = ucs4lib_find_max_char(u, u + len);
2358 if (max_char >= 0x10000)
2359 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002360 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002361 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002362 if (copy != NULL)
2363 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 Py_DECREF(unicode);
2365 *p_unicode = copy;
2366}
2367
Victor Stinner034f6cf2011-09-30 02:26:44 +02002368PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002369_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370{
Victor Stinner87af4f22011-11-21 23:03:47 +01002371 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002372 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002373
Victor Stinner034f6cf2011-09-30 02:26:44 +02002374 if (!PyUnicode_Check(unicode)) {
2375 PyErr_BadInternalCall();
2376 return NULL;
2377 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002378 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002380
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 length = PyUnicode_GET_LENGTH(unicode);
2382 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 if (!copy)
2384 return NULL;
2385 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2386
Christian Heimesf051e432016-09-13 20:22:02 +02002387 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002388 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002389 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002391}
2392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393
Victor Stinnerbc603d12011-10-02 01:00:40 +02002394/* Widen Unicode objects to larger buffers. Don't write terminating null
2395 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396
2397void*
2398_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2399{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 Py_ssize_t len;
2401 void *result;
2402 unsigned int skind;
2403
Benjamin Petersonbac79492012-01-14 13:34:47 -05002404 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405 return NULL;
2406
2407 len = PyUnicode_GET_LENGTH(s);
2408 skind = PyUnicode_KIND(s);
2409 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002410 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 return NULL;
2412 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002413 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002415 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 if (!result)
2417 return PyErr_NoMemory();
2418 assert(skind == PyUnicode_1BYTE_KIND);
2419 _PyUnicode_CONVERT_BYTES(
2420 Py_UCS1, Py_UCS2,
2421 PyUnicode_1BYTE_DATA(s),
2422 PyUnicode_1BYTE_DATA(s) + len,
2423 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 if (skind == PyUnicode_2BYTE_KIND) {
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS2, Py_UCS4,
2432 PyUnicode_2BYTE_DATA(s),
2433 PyUnicode_2BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 else {
2437 assert(skind == PyUnicode_1BYTE_KIND);
2438 _PyUnicode_CONVERT_BYTES(
2439 Py_UCS1, Py_UCS4,
2440 PyUnicode_1BYTE_DATA(s),
2441 PyUnicode_1BYTE_DATA(s) + len,
2442 result);
2443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002445 default:
2446 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 }
Victor Stinner01698042011-10-04 00:04:26 +02002448 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 return NULL;
2450}
2451
2452static Py_UCS4*
2453as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2454 int copy_null)
2455{
2456 int kind;
2457 void *data;
2458 Py_ssize_t len, targetlen;
2459 if (PyUnicode_READY(string) == -1)
2460 return NULL;
2461 kind = PyUnicode_KIND(string);
2462 data = PyUnicode_DATA(string);
2463 len = PyUnicode_GET_LENGTH(string);
2464 targetlen = len;
2465 if (copy_null)
2466 targetlen++;
2467 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002468 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!target) {
2470 PyErr_NoMemory();
2471 return NULL;
2472 }
2473 }
2474 else {
2475 if (targetsize < targetlen) {
2476 PyErr_Format(PyExc_SystemError,
2477 "string is longer than the buffer");
2478 if (copy_null && 0 < targetsize)
2479 target[0] = 0;
2480 return NULL;
2481 }
2482 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002483 if (kind == PyUnicode_1BYTE_KIND) {
2484 Py_UCS1 *start = (Py_UCS1 *) data;
2485 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002487 else if (kind == PyUnicode_2BYTE_KIND) {
2488 Py_UCS2 *start = (Py_UCS2 *) data;
2489 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2490 }
2491 else {
2492 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002493 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 if (copy_null)
2496 target[len] = 0;
2497 return target;
2498}
2499
2500Py_UCS4*
2501PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002504 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 PyErr_BadInternalCall();
2506 return NULL;
2507 }
2508 return as_ucs4(string, target, targetsize, copy_null);
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4Copy(PyObject *string)
2513{
2514 return as_ucs4(string, NULL, 0, 1);
2515}
2516
/* Maximum number of characters required for the output of %lld or %p.
   Used as the size of the on-stack buffers that integer and pointer
   arguments are formatted into with sprintf().
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002521
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002522static int
2523unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2524 Py_ssize_t width, Py_ssize_t precision)
2525{
2526 Py_ssize_t length, fill, arglen;
2527 Py_UCS4 maxchar;
2528
2529 if (PyUnicode_READY(str) == -1)
2530 return -1;
2531
2532 length = PyUnicode_GET_LENGTH(str);
2533 if ((precision == -1 || precision >= length)
2534 && width <= length)
2535 return _PyUnicodeWriter_WriteStr(writer, str);
2536
2537 if (precision != -1)
2538 length = Py_MIN(precision, length);
2539
2540 arglen = Py_MAX(length, width);
2541 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2542 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2543 else
2544 maxchar = writer->maxchar;
2545
2546 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2547 return -1;
2548
2549 if (width > length) {
2550 fill = width - length;
2551 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2552 return -1;
2553 writer->pos += fill;
2554 }
2555
2556 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2557 str, 0, length);
2558 writer->pos += length;
2559 return 0;
2560}
2561
2562static int
2563unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2564 Py_ssize_t width, Py_ssize_t precision)
2565{
2566 /* UTF-8 */
2567 Py_ssize_t length;
2568 PyObject *unicode;
2569 int res;
2570
2571 length = strlen(str);
2572 if (precision != -1)
2573 length = Py_MIN(length, precision);
2574 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2575 if (unicode == NULL)
2576 return -1;
2577
2578 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2579 Py_DECREF(unicode);
2580 return res;
2581}
2582
Victor Stinner96865452011-03-01 23:44:09 +00002583static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002584unicode_fromformat_arg(_PyUnicodeWriter *writer,
2585 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002586{
Victor Stinnere215d962012-10-06 23:03:36 +02002587 const char *p;
2588 Py_ssize_t len;
2589 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 Py_ssize_t width;
2591 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002592 int longflag;
2593 int longlongflag;
2594 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002596
2597 p = f;
2598 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002599 zeropad = 0;
2600 if (*f == '0') {
2601 zeropad = 1;
2602 f++;
2603 }
Victor Stinner96865452011-03-01 23:44:09 +00002604
2605 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 width = -1;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002609 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002610 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002612 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002614 return NULL;
2615 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002617 f++;
2618 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 }
2620 precision = -1;
2621 if (*f == '.') {
2622 f++;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 precision = (*f - '0');
2625 f++;
2626 while (Py_ISDIGIT((unsigned)*f)) {
2627 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2628 PyErr_SetString(PyExc_ValueError,
2629 "precision too big");
2630 return NULL;
2631 }
2632 precision = (precision * 10) + (*f - '0');
2633 f++;
2634 }
2635 }
Victor Stinner96865452011-03-01 23:44:09 +00002636 if (*f == '%') {
2637 /* "%.3%s" => f points to "3" */
2638 f--;
2639 }
2640 }
2641 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002642 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002643 f--;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645
2646 /* Handle %ld, %lu, %lld and %llu. */
2647 longflag = 0;
2648 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002649 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002650 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002652 longflag = 1;
2653 ++f;
2654 }
Victor Stinner96865452011-03-01 23:44:09 +00002655 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002656 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002657 longlongflag = 1;
2658 f += 2;
2659 }
Victor Stinner96865452011-03-01 23:44:09 +00002660 }
2661 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002663 size_tflag = 1;
2664 ++f;
2665 }
Victor Stinnere215d962012-10-06 23:03:36 +02002666
2667 if (f[1] == '\0')
2668 writer->overallocate = 0;
2669
2670 switch (*f) {
2671 case 'c':
2672 {
2673 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002674 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002675 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002676 "character argument not in range(0x110000)");
2677 return NULL;
2678 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002679 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002680 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002681 break;
2682 }
2683
2684 case 'i':
2685 case 'd':
2686 case 'u':
2687 case 'x':
2688 {
2689 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002690 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002692
2693 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002694 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002695 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002697 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002698 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002699 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002700 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, size_t));
2703 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, unsigned int));
2706 }
2707 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002709 }
2710 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002714 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002715 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002716 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002717 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002719 va_arg(*vargs, Py_ssize_t));
2720 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002721 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_arg(*vargs, int));
2723 }
2724 assert(len >= 0);
2725
Victor Stinnere215d962012-10-06 23:03:36 +02002726 if (precision < len)
2727 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728
2729 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2731 return NULL;
2732
Victor Stinnere215d962012-10-06 23:03:36 +02002733 if (width > precision) {
2734 Py_UCS4 fillchar;
2735 fill = width - precision;
2736 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002737 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2738 return NULL;
2739 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 }
Victor Stinner15a11362012-10-06 23:48:20 +02002741 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002743 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2744 return NULL;
2745 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002746 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747
Victor Stinner4a587072013-11-19 12:54:53 +01002748 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2749 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002750 break;
2751 }
2752
2753 case 'p':
2754 {
2755 char number[MAX_LONG_LONG_CHARS];
2756
2757 len = sprintf(number, "%p", va_arg(*vargs, void*));
2758 assert(len >= 0);
2759
2760 /* %p is ill-defined: ensure leading 0x. */
2761 if (number[1] == 'X')
2762 number[1] = 'x';
2763 else if (number[1] != 'x') {
2764 memmove(number + 2, number,
2765 strlen(number) + 1);
2766 number[0] = '0';
2767 number[1] = 'x';
2768 len += 2;
2769 }
2770
Victor Stinner4a587072013-11-19 12:54:53 +01002771 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002772 return NULL;
2773 break;
2774 }
2775
2776 case 's':
2777 {
2778 /* UTF-8 */
2779 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002782 break;
2783 }
2784
2785 case 'U':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 assert(obj && _PyUnicode_CHECK(obj));
2789
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 break;
2793 }
2794
2795 case 'V':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002799 if (obj) {
2800 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 }
2804 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 assert(str != NULL);
2806 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002808 }
2809 break;
2810 }
2811
2812 case 'S':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *str;
2816 assert(obj);
2817 str = PyObject_Str(obj);
2818 if (!str)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(str);
2822 return NULL;
2823 }
2824 Py_DECREF(str);
2825 break;
2826 }
2827
2828 case 'R':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *repr;
2832 assert(obj);
2833 repr = PyObject_Repr(obj);
2834 if (!repr)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(repr);
2838 return NULL;
2839 }
2840 Py_DECREF(repr);
2841 break;
2842 }
2843
2844 case 'A':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *ascii;
2848 assert(obj);
2849 ascii = PyObject_ASCII(obj);
2850 if (!ascii)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(ascii);
2854 return NULL;
2855 }
2856 Py_DECREF(ascii);
2857 break;
2858 }
2859
2860 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002861 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864
2865 default:
2866 /* if we stumble upon an unknown formatting code, copy the rest
2867 of the format string to the output string. (we cannot just
2868 skip the code, since there's no way to know what's in the
2869 argument list) */
2870 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002871 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002872 return NULL;
2873 f = p+len;
2874 return f;
2875 }
2876
2877 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002878 return f;
2879}
2880
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881PyObject *
2882PyUnicode_FromFormatV(const char *format, va_list vargs)
2883{
Victor Stinnere215d962012-10-06 23:03:36 +02002884 va_list vargs2;
2885 const char *f;
2886 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002887
Victor Stinner8f674cc2013-04-17 23:02:17 +02002888 _PyUnicodeWriter_Init(&writer);
2889 writer.min_length = strlen(format) + 100;
2890 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002891
Benjamin Peterson0c212142016-09-20 20:39:33 -07002892 // Copy varags to be able to pass a reference to a subfunction.
2893 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002894
2895 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002897 f = unicode_fromformat_arg(&writer, f, &vargs2);
2898 if (f == NULL)
2899 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002902 const char *p;
2903 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904
Victor Stinnere215d962012-10-06 23:03:36 +02002905 p = f;
2906 do
2907 {
2908 if ((unsigned char)*p > 127) {
2909 PyErr_Format(PyExc_ValueError,
2910 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2911 "string, got a non-ASCII byte: 0x%02x",
2912 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 p++;
2916 }
2917 while (*p != '\0' && *p != '%');
2918 len = p - f;
2919
2920 if (*p == '\0')
2921 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002922
2923 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925
2926 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002929 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002930 return _PyUnicodeWriter_Finish(&writer);
2931
2932 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002933 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002934 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936}
2937
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938PyObject *
2939PyUnicode_FromFormat(const char *format, ...)
2940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002941 PyObject* ret;
2942 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002943
2944#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 ret = PyUnicode_FromFormatV(format, vargs);
2950 va_end(vargs);
2951 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954#ifdef HAVE_WCHAR_H
2955
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002956/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002957
Victor Stinnerd88d9832011-09-06 02:00:05 +02002958 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 character) required to convert the unicode object. Ignore size argument.
2960
Victor Stinnerd88d9832011-09-06 02:00:05 +02002961 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002962 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002963 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002964Py_ssize_t
2965PyUnicode_AsWideChar(PyObject *unicode,
2966 wchar_t *w,
2967 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002968{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002969 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002970 const wchar_t *wstr;
2971
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002972 if (unicode == NULL) {
2973 PyErr_BadInternalCall();
2974 return -1;
2975 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002976 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002977 if (wstr == NULL)
2978 return -1;
2979
Victor Stinner5593d8a2010-10-02 11:11:27 +00002980 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002981 if (size > res)
2982 size = res + 1;
2983 else
2984 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002985 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002986 return res;
2987 }
2988 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002989 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002990}
2991
Victor Stinner137c34c2010-09-29 10:25:54 +00002992wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002993PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002994 Py_ssize_t *size)
2995{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002996 const wchar_t *wstr;
2997 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00002998 Py_ssize_t buflen;
2999
3000 if (unicode == NULL) {
3001 PyErr_BadInternalCall();
3002 return NULL;
3003 }
3004
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003005 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3006 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003007 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003008 }
3009 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3010 PyErr_SetString(PyExc_ValueError,
3011 "embedded null character");
3012 return NULL;
3013 }
3014
3015 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003016 if (buffer == NULL) {
3017 PyErr_NoMemory();
3018 return NULL;
3019 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003020 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003021 if (size != NULL)
3022 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003023 return buffer;
3024}
3025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003026#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027
Alexander Belopolsky40018472011-02-26 01:02:56 +00003028PyObject *
3029PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003030{
Victor Stinner8faf8212011-12-08 22:14:11 +01003031 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 PyErr_SetString(PyExc_ValueError,
3033 "chr() arg not in range(0x110000)");
3034 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003035 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003036
Victor Stinner985a82a2014-01-03 12:53:47 +01003037 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003041PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003043 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003045 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003046 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003047 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003048 Py_INCREF(obj);
3049 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003050 }
3051 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 /* For a Unicode subtype that's not a Unicode object,
3053 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003054 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003055 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003056 PyErr_Format(PyExc_TypeError,
3057 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003058 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003059 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003060}
3061
Alexander Belopolsky40018472011-02-26 01:02:56 +00003062PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003063PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003064 const char *encoding,
3065 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003066{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003067 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003069
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 PyErr_BadInternalCall();
3072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003074
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003075 /* Decoding bytes objects is the most common case and should be fast */
3076 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003077 if (PyBytes_GET_SIZE(obj) == 0)
3078 _Py_RETURN_UNICODE_EMPTY();
3079 v = PyUnicode_Decode(
3080 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3081 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003082 return v;
3083 }
3084
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003085 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 PyErr_SetString(PyExc_TypeError,
3087 "decoding str is not supported");
3088 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003089 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003090
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003091 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3092 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3093 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003094 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003095 Py_TYPE(obj)->tp_name);
3096 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003097 }
Tim Petersced69f82003-09-16 20:30:58 +00003098
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003099 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003100 PyBuffer_Release(&buffer);
3101 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003103
Serhiy Storchaka05997252013-01-26 12:14:02 +02003104 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003105 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003106 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107}
3108
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are kept (lowercased).  Every run of other
   characters is replaced by a single '_', except that a run at the very
   start or at the very end of the name is dropped.

   `lower` receives the null-terminated result and must have room for
   `lower_len` characters.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the terminator fits).  On error the
   content of `lower` is unspecified and may not be null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* lower_len - 1 would wrap around and disable every bounds check
           below. */
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null character. */
    l_end = &lower[lower_len - 1];
    /* Set while a run of separator characters is pending. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Flush the pending separator run as one '_', unless nothing
               has been written yet (leading separators are dropped). */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3157
Alexander Belopolsky40018472011-02-26 01:02:56 +00003158PyObject *
3159PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003160 Py_ssize_t size,
3161 const char *encoding,
3162 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003163{
3164 PyObject *buffer = NULL, *unicode;
3165 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003166 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3167
3168 if (encoding == NULL) {
3169 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3170 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003171
Fred Drakee4315f52000-05-09 19:53:39 +00003172 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003173 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3174 char *lower = buflower;
3175
3176 /* Fast paths */
3177 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3178 lower += 3;
3179 if (*lower == '_') {
3180 /* Match "utf8" and "utf_8" */
3181 lower++;
3182 }
3183
3184 if (lower[0] == '8' && lower[1] == 0) {
3185 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3186 }
3187 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3188 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3189 }
3190 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3191 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3192 }
3193 }
3194 else {
3195 if (strcmp(lower, "ascii") == 0
3196 || strcmp(lower, "us_ascii") == 0) {
3197 return PyUnicode_DecodeASCII(s, size, errors);
3198 }
Steve Dowercc16be82016-09-08 10:35:16 -07003199 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003200 else if (strcmp(lower, "mbcs") == 0) {
3201 return PyUnicode_DecodeMBCS(s, size, errors);
3202 }
3203 #endif
3204 else if (strcmp(lower, "latin1") == 0
3205 || strcmp(lower, "latin_1") == 0
3206 || strcmp(lower, "iso_8859_1") == 0
3207 || strcmp(lower, "iso8859_1") == 0) {
3208 return PyUnicode_DecodeLatin1(s, size, errors);
3209 }
3210 }
Victor Stinner37296e82010-06-10 13:36:23 +00003211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212
3213 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003214 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003215 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003216 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003217 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 if (buffer == NULL)
3219 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003220 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 if (unicode == NULL)
3222 goto onError;
3223 if (!PyUnicode_Check(unicode)) {
3224 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003225 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3226 "use codecs.decode() to decode to arbitrary types",
3227 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003228 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 Py_DECREF(unicode);
3230 goto onError;
3231 }
3232 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003233 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003234
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 Py_XDECREF(buffer);
3237 return NULL;
3238}
3239
Alexander Belopolsky40018472011-02-26 01:02:56 +00003240PyObject *
3241PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003242 const char *encoding,
3243 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003244{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003245 if (!PyUnicode_Check(unicode)) {
3246 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003247 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003248 }
3249
Serhiy Storchaka00939072016-10-27 21:05:49 +03003250 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3251 "PyUnicode_AsDecodedObject() is deprecated; "
3252 "use PyCodec_Decode() to decode from str", 1) < 0)
3253 return NULL;
3254
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003255 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003257
3258 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003259 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003260}
3261
Alexander Belopolsky40018472011-02-26 01:02:56 +00003262PyObject *
3263PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003264 const char *encoding,
3265 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003266{
3267 PyObject *v;
3268
3269 if (!PyUnicode_Check(unicode)) {
3270 PyErr_BadArgument();
3271 goto onError;
3272 }
3273
Serhiy Storchaka00939072016-10-27 21:05:49 +03003274 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3275 "PyUnicode_AsDecodedUnicode() is deprecated; "
3276 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3277 return NULL;
3278
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003279 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003281
3282 /* Decode via the codec registry */
3283 v = PyCodec_Decode(unicode, encoding, errors);
3284 if (v == NULL)
3285 goto onError;
3286 if (!PyUnicode_Check(v)) {
3287 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003288 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3289 "use codecs.decode() to decode to arbitrary types",
3290 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003291 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292 Py_DECREF(v);
3293 goto onError;
3294 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003295 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003296
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003298 return NULL;
3299}
3300
Alexander Belopolsky40018472011-02-26 01:02:56 +00003301PyObject *
3302PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003303 Py_ssize_t size,
3304 const char *encoding,
3305 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306{
3307 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003308
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003309 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3313 Py_DECREF(unicode);
3314 return v;
3315}
3316
Alexander Belopolsky40018472011-02-26 01:02:56 +00003317PyObject *
3318PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003319 const char *encoding,
3320 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003321{
3322 PyObject *v;
3323
3324 if (!PyUnicode_Check(unicode)) {
3325 PyErr_BadArgument();
3326 goto onError;
3327 }
3328
Serhiy Storchaka00939072016-10-27 21:05:49 +03003329 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3330 "PyUnicode_AsEncodedObject() is deprecated; "
3331 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3332 "or PyCodec_Encode() for generic encoding", 1) < 0)
3333 return NULL;
3334
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003335 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003337
3338 /* Encode via the codec registry */
3339 v = PyCodec_Encode(unicode, encoding, errors);
3340 if (v == NULL)
3341 goto onError;
3342 return v;
3343
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003345 return NULL;
3346}
3347
/* Find the first wide character of `wstr` that wcstombs() cannot encode.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Return the index in `wstr` of the
   first character for which wcstombs() fails, or 0 if every character can
   be converted. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *pos = wstr;
    char encoded[MB_LEN_MAX];
    /* Null-terminated scratch string holding the unit under test. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif

    while (*pos != L'\0') {
        const wchar_t *unit_start = pos;

        unit[0] = *pos;
        pos++;
#if SIZEOF_WCHAR_T == 2
        /* Keep a surrogate pair together: it encodes a single character. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*pos))
        {
            unit[1] = *pos;
            pos++;
        }
        else {
            unit[1] = 0;
        }
#endif
        if (wcstombs(encoded, unit, sizeof(encoded)) == (size_t)-1) {
            return unit_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3394
Victor Stinner1b579672011-12-17 05:47:23 +01003395static int
3396locale_error_handler(const char *errors, int *surrogateescape)
3397{
Victor Stinner50149202015-09-22 00:26:54 +02003398 _Py_error_handler error_handler = get_error_handler(errors);
3399 switch (error_handler)
3400 {
3401 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003402 *surrogateescape = 0;
3403 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003404 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003405 *surrogateescape = 1;
3406 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003407 default:
3408 PyErr_Format(PyExc_ValueError,
3409 "only 'strict' and 'surrogateescape' error handlers "
3410 "are supported, not '%s'",
3411 errors);
3412 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003413 }
Victor Stinner1b579672011-12-17 05:47:23 +01003414}
3415
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003417PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418{
3419 Py_ssize_t wlen, wlen2;
3420 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003422 PyObject *bytes, *reason, *exc;
3423 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003424 int surrogateescape;
3425
3426 if (locale_error_handler(errors, &surrogateescape) < 0)
3427 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003428
3429 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3430 if (wstr == NULL)
3431 return NULL;
3432
3433 wlen2 = wcslen(wstr);
3434 if (wlen2 != wlen) {
3435 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003436 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003437 return NULL;
3438 }
3439
3440 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003441 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003442 char *str;
3443
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003444 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003445 if (str == NULL) {
3446 if (error_pos == (size_t)-1) {
3447 PyErr_NoMemory();
3448 PyMem_Free(wstr);
3449 return NULL;
3450 }
3451 else {
3452 goto encode_error;
3453 }
3454 }
3455 PyMem_Free(wstr);
3456
3457 bytes = PyBytes_FromString(str);
3458 PyMem_Free(str);
3459 }
3460 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003461 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003462 size_t len, len2;
3463
3464 len = wcstombs(NULL, wstr, 0);
3465 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003466 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003467 goto encode_error;
3468 }
3469
3470 bytes = PyBytes_FromStringAndSize(NULL, len);
3471 if (bytes == NULL) {
3472 PyMem_Free(wstr);
3473 return NULL;
3474 }
3475
3476 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3477 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003478 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003479 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003480 goto encode_error;
3481 }
3482 PyMem_Free(wstr);
3483 }
3484 return bytes;
3485
3486encode_error:
3487 errmsg = strerror(errno);
3488 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003489
3490 if (error_pos == (size_t)-1)
3491 error_pos = wcstombs_errorpos(wstr);
3492
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003494
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003495 wstr = Py_DecodeLocale(errmsg, &errlen);
3496 if (wstr != NULL) {
3497 reason = PyUnicode_FromWideChar(wstr, errlen);
3498 PyMem_RawFree(wstr);
3499 } else {
3500 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003501 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003502
Victor Stinner2f197072011-12-17 07:08:30 +01003503 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003504 reason = PyUnicode_FromString(
3505 "wcstombs() encountered an unencodable "
3506 "wide character");
3507 if (reason == NULL)
3508 return NULL;
3509
3510 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3511 "locale", unicode,
3512 (Py_ssize_t)error_pos,
3513 (Py_ssize_t)(error_pos+1),
3514 reason);
3515 Py_DECREF(reason);
3516 if (exc != NULL) {
3517 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003518 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003519 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003520 return NULL;
3521}
3522
Victor Stinnerad158722010-10-27 00:25:46 +00003523PyObject *
3524PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003525{
Steve Dowercc16be82016-09-08 10:35:16 -07003526#if defined(__APPLE__)
3527 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003528#else
Victor Stinner793b5312011-04-27 00:24:21 +02003529 PyInterpreterState *interp = PyThreadState_GET()->interp;
3530 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3531 cannot use it to encode and decode filenames before it is loaded. Load
3532 the Python codec requires to encode at least its own filename. Use the C
3533 version of the locale codec until the codec registry is initialized and
3534 the Python codec is loaded.
3535
3536 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3537 cannot only rely on it: check also interp->fscodec_initialized for
3538 subinterpreters. */
3539 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003540 return PyUnicode_AsEncodedString(unicode,
3541 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003542 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003543 }
3544 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003545 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003546 }
Victor Stinnerad158722010-10-27 00:25:46 +00003547#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003548}
3549
Alexander Belopolsky40018472011-02-26 01:02:56 +00003550PyObject *
3551PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003552 const char *encoding,
3553 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554{
3555 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003556 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003557
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 if (!PyUnicode_Check(unicode)) {
3559 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 }
Fred Drakee4315f52000-05-09 19:53:39 +00003562
Victor Stinner942889a2016-09-05 15:40:10 -07003563 if (encoding == NULL) {
3564 return _PyUnicode_AsUTF8String(unicode, errors);
3565 }
3566
Fred Drakee4315f52000-05-09 19:53:39 +00003567 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003568 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3569 char *lower = buflower;
3570
3571 /* Fast paths */
3572 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3573 lower += 3;
3574 if (*lower == '_') {
3575 /* Match "utf8" and "utf_8" */
3576 lower++;
3577 }
3578
3579 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003580 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003581 }
3582 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3583 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3584 }
3585 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3586 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3587 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003588 }
Victor Stinner942889a2016-09-05 15:40:10 -07003589 else {
3590 if (strcmp(lower, "ascii") == 0
3591 || strcmp(lower, "us_ascii") == 0) {
3592 return _PyUnicode_AsASCIIString(unicode, errors);
3593 }
Steve Dowercc16be82016-09-08 10:35:16 -07003594#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003595 else if (strcmp(lower, "mbcs") == 0) {
3596 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3597 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003598#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003599 else if (strcmp(lower, "latin1") == 0 ||
3600 strcmp(lower, "latin_1") == 0 ||
3601 strcmp(lower, "iso_8859_1") == 0 ||
3602 strcmp(lower, "iso8859_1") == 0) {
3603 return _PyUnicode_AsLatin1String(unicode, errors);
3604 }
3605 }
Victor Stinner37296e82010-06-10 13:36:23 +00003606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607
3608 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003609 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003611 return NULL;
3612
3613 /* The normal path */
3614 if (PyBytes_Check(v))
3615 return v;
3616
3617 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003618 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003619 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003620 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003621
3622 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003623 "encoder %s returned bytearray instead of bytes; "
3624 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003625 encoding);
3626 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003627 Py_DECREF(v);
3628 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003629 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003630
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003631 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3632 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003633 Py_DECREF(v);
3634 return b;
3635 }
3636
3637 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003638 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3639 "use codecs.encode() to encode to arbitrary types",
3640 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003641 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003642 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003643 return NULL;
3644}
3645
Alexander Belopolsky40018472011-02-26 01:02:56 +00003646PyObject *
3647PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003648 const char *encoding,
3649 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003650{
3651 PyObject *v;
3652
3653 if (!PyUnicode_Check(unicode)) {
3654 PyErr_BadArgument();
3655 goto onError;
3656 }
3657
Serhiy Storchaka00939072016-10-27 21:05:49 +03003658 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3659 "PyUnicode_AsEncodedUnicode() is deprecated; "
3660 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3661 return NULL;
3662
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003663 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003665
3666 /* Encode via the codec registry */
3667 v = PyCodec_Encode(unicode, encoding, errors);
3668 if (v == NULL)
3669 goto onError;
3670 if (!PyUnicode_Check(v)) {
3671 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003672 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3673 "use codecs.encode() to encode to arbitrary types",
3674 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003675 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003676 Py_DECREF(v);
3677 goto onError;
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003680
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 return NULL;
3683}
3684
/* Locate the first byte sequence in str[0..len) that mbrtowc() cannot
   convert.

   Returns the byte offset of the sequence for which mbrtowc() reported a
   conversion error or an incomplete character.  Returns 0 when no such
   sequence is found, and always returns 0 when HAVE_MBRTOWC is not
   defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3715
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003716PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003718 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719{
3720 wchar_t smallbuf[256];
3721 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3722 wchar_t *wstr;
3723 size_t wlen, wlen2;
3724 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003725 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003726 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003727 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003728 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003729
3730 if (locale_error_handler(errors, &surrogateescape) < 0)
3731 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003733 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3734 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003735 return NULL;
3736 }
3737
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003738 if (surrogateescape) {
3739 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003740 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003741 if (wstr == NULL) {
3742 if (wlen == (size_t)-1)
3743 PyErr_NoMemory();
3744 else
3745 PyErr_SetFromErrno(PyExc_OSError);
3746 return NULL;
3747 }
3748
3749 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003750 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751 }
3752 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003753 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003754#ifndef HAVE_BROKEN_MBSTOWCS
3755 wlen = mbstowcs(NULL, str, 0);
3756#else
3757 wlen = len;
3758#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003759 if (wlen == (size_t)-1)
3760 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003761 if (wlen+1 <= smallbuf_len) {
3762 wstr = smallbuf;
3763 }
3764 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003765 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003766 if (!wstr)
3767 return PyErr_NoMemory();
3768 }
3769
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003770 wlen2 = mbstowcs(wstr, str, wlen+1);
3771 if (wlen2 == (size_t)-1) {
3772 if (wstr != smallbuf)
3773 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003774 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003775 }
3776#ifdef HAVE_BROKEN_MBSTOWCS
3777 assert(wlen2 == wlen);
3778#endif
3779 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3780 if (wstr != smallbuf)
3781 PyMem_Free(wstr);
3782 }
3783 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003784
3785decode_error:
3786 errmsg = strerror(errno);
3787 assert(errmsg != NULL);
3788
3789 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003790 wstr = Py_DecodeLocale(errmsg, &errlen);
3791 if (wstr != NULL) {
3792 reason = PyUnicode_FromWideChar(wstr, errlen);
3793 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003794 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003795
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003796 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003797 reason = PyUnicode_FromString(
3798 "mbstowcs() encountered an invalid multibyte sequence");
3799 if (reason == NULL)
3800 return NULL;
3801
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3803 "locale", str, len,
3804 (Py_ssize_t)error_pos,
3805 (Py_ssize_t)(error_pos+1),
3806 reason);
3807 Py_DECREF(reason);
3808 if (exc != NULL) {
3809 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003810 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003811 }
3812 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003813}
3814
3815PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003816PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003817{
3818 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003819 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003820}
3821
3822
3823PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003824PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003825 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003826 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3827}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003828
Christian Heimes5894ba72007-11-04 11:43:14 +00003829PyObject*
3830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3831{
Steve Dowercc16be82016-09-08 10:35:16 -07003832#if defined(__APPLE__)
3833 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003834#else
Victor Stinner793b5312011-04-27 00:24:21 +02003835 PyInterpreterState *interp = PyThreadState_GET()->interp;
3836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3837 cannot use it to encode and decode filenames before it is loaded. Load
3838 the Python codec requires to encode at least its own filename. Use the C
3839 version of the locale codec until the codec registry is initialized and
3840 the Python codec is loaded.
3841
3842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3843 cannot only rely on it: check also interp->fscodec_initialized for
3844 subinterpreters. */
3845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003846 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003848 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003849 }
3850 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003851 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003852 }
Victor Stinnerad158722010-10-27 00:25:46 +00003853#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003854}
3855
Martin v. Löwis011e8422009-05-05 04:43:17 +00003856
3857int
3858PyUnicode_FSConverter(PyObject* arg, void* addr)
3859{
Brett Cannonec6ce872016-09-06 15:50:29 -07003860 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003861 PyObject *output = NULL;
3862 Py_ssize_t size;
3863 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003864 if (arg == NULL) {
3865 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003866 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003867 return 1;
3868 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003869 path = PyOS_FSPath(arg);
3870 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003871 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003872 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003873 if (PyBytes_Check(path)) {
3874 output = path;
3875 }
3876 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3877 output = PyUnicode_EncodeFSDefault(path);
3878 Py_DECREF(path);
3879 if (!output) {
3880 return 0;
3881 }
3882 assert(PyBytes_Check(output));
3883 }
3884
Victor Stinner0ea2a462010-04-30 00:22:08 +00003885 size = PyBytes_GET_SIZE(output);
3886 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003887 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003888 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003889 Py_DECREF(output);
3890 return 0;
3891 }
3892 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003893 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003894}
3895
3896
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003897int
3898PyUnicode_FSDecoder(PyObject* arg, void* addr)
3899{
Brett Cannona5711202016-09-06 19:36:01 -07003900 int is_buffer = 0;
3901 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003902 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003903 if (arg == NULL) {
3904 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003905 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003906 return 1;
3907 }
Brett Cannona5711202016-09-06 19:36:01 -07003908
3909 is_buffer = PyObject_CheckBuffer(arg);
3910 if (!is_buffer) {
3911 path = PyOS_FSPath(arg);
3912 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003913 return 0;
3914 }
Brett Cannona5711202016-09-06 19:36:01 -07003915 }
3916 else {
3917 path = arg;
3918 Py_INCREF(arg);
3919 }
3920
3921 if (PyUnicode_Check(path)) {
3922 if (PyUnicode_READY(path) == -1) {
3923 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003924 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003925 }
3926 output = path;
3927 }
3928 else if (PyBytes_Check(path) || is_buffer) {
3929 PyObject *path_bytes = NULL;
3930
3931 if (!PyBytes_Check(path) &&
3932 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3933 "path should be string, bytes, or os.PathLike, not %.200s",
3934 Py_TYPE(arg)->tp_name)) {
3935 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003936 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003937 }
3938 path_bytes = PyBytes_FromObject(path);
3939 Py_DECREF(path);
3940 if (!path_bytes) {
3941 return 0;
3942 }
3943 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3944 PyBytes_GET_SIZE(path_bytes));
3945 Py_DECREF(path_bytes);
3946 if (!output) {
3947 return 0;
3948 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003949 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003950 else {
3951 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003952 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003953 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003954 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003955 return 0;
3956 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003957 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003958 Py_DECREF(output);
3959 return 0;
3960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003962 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003963 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003964 Py_DECREF(output);
3965 return 0;
3966 }
3967 *(PyObject**)addr = output;
3968 return Py_CLEANUP_SUPPORTED;
3969}
3970
3971
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003972const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003974{
Christian Heimesf3863112007-11-22 07:46:41 +00003975 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003977 if (!PyUnicode_Check(unicode)) {
3978 PyErr_BadArgument();
3979 return NULL;
3980 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003981 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003982 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003984 if (PyUnicode_UTF8(unicode) == NULL) {
3985 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003986 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 if (bytes == NULL)
3988 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003989 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3990 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003991 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 Py_DECREF(bytes);
3993 return NULL;
3994 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003995 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003996 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 PyBytes_AS_STRING(bytes),
3998 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 Py_DECREF(bytes);
4000 }
4001
4002 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004003 *psize = PyUnicode_UTF8_LENGTH(unicode);
4004 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004005}
4006
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004007const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4011}
4012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013Py_UNICODE *
4014PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 const unsigned char *one_byte;
4017#if SIZEOF_WCHAR_T == 4
4018 const Py_UCS2 *two_bytes;
4019#else
4020 const Py_UCS4 *four_bytes;
4021 const Py_UCS4 *ucs4_end;
4022 Py_ssize_t num_surrogates;
4023#endif
4024 wchar_t *w;
4025 wchar_t *wchar_end;
4026
4027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 return NULL;
4030 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004031 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004033 assert(_PyUnicode_KIND(unicode) != 0);
4034 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004036 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004038 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4039 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 num_surrogates = 0;
4041
4042 for (; four_bytes < ucs4_end; ++four_bytes) {
4043 if (*four_bytes > 0xFFFF)
4044 ++num_surrogates;
4045 }
4046
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004047 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4048 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4049 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 PyErr_NoMemory();
4051 return NULL;
4052 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004053 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004055 w = _PyUnicode_WSTR(unicode);
4056 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4057 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4059 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004060 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004062 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4063 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 }
4065 else
4066 *w = *four_bytes;
4067
4068 if (w > wchar_end) {
4069 assert(0 && "Miscalculated string end");
4070 }
4071 }
4072 *w = 0;
4073#else
4074 /* sizeof(wchar_t) == 4 */
4075 Py_FatalError("Impossible unicode object state, wstr and str "
4076 "should share memory already.");
4077 return NULL;
4078#endif
4079 }
4080 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004081 if ((size_t)_PyUnicode_LENGTH(unicode) >
4082 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4083 PyErr_NoMemory();
4084 return NULL;
4085 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004086 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4087 (_PyUnicode_LENGTH(unicode) + 1));
4088 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 PyErr_NoMemory();
4090 return NULL;
4091 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004092 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4093 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4094 w = _PyUnicode_WSTR(unicode);
4095 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004097 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4098 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099 for (; w < wchar_end; ++one_byte, ++w)
4100 *w = *one_byte;
4101 /* null-terminate the wstr */
4102 *w = 0;
4103 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004104 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004106 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107 for (; w < wchar_end; ++two_bytes, ++w)
4108 *w = *two_bytes;
4109 /* null-terminate the wstr */
4110 *w = 0;
4111#else
4112 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004113 PyObject_FREE(_PyUnicode_WSTR(unicode));
4114 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 Py_FatalError("Impossible unicode object state, wstr "
4116 "and str should share memory already.");
4117 return NULL;
4118#endif
4119 }
4120 else {
4121 assert(0 && "This should never happen.");
4122 }
4123 }
4124 }
4125 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004126 *size = PyUnicode_WSTR_LENGTH(unicode);
4127 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004128}
4129
Alexander Belopolsky40018472011-02-26 01:02:56 +00004130Py_UNICODE *
4131PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134}
4135
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004136const Py_UNICODE *
4137_PyUnicode_AsUnicode(PyObject *unicode)
4138{
4139 Py_ssize_t size;
4140 const Py_UNICODE *wstr;
4141
4142 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4143 if (wstr && wcslen(wstr) != (size_t)size) {
4144 PyErr_SetString(PyExc_ValueError, "embedded null character");
4145 return NULL;
4146 }
4147 return wstr;
4148}
4149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004150
Alexander Belopolsky40018472011-02-26 01:02:56 +00004151Py_ssize_t
4152PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153{
4154 if (!PyUnicode_Check(unicode)) {
4155 PyErr_BadArgument();
4156 goto onError;
4157 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004158 if (_PyUnicode_WSTR(unicode) == NULL) {
4159 if (PyUnicode_AsUnicode(unicode) == NULL)
4160 goto onError;
4161 }
4162 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 return -1;
4166}
4167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004168Py_ssize_t
4169PyUnicode_GetLength(PyObject *unicode)
4170{
Victor Stinner07621332012-06-16 04:53:46 +02004171 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172 PyErr_BadArgument();
4173 return -1;
4174 }
Victor Stinner07621332012-06-16 04:53:46 +02004175 if (PyUnicode_READY(unicode) == -1)
4176 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004177 return PyUnicode_GET_LENGTH(unicode);
4178}
4179
4180Py_UCS4
4181PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4182{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004183 void *data;
4184 int kind;
4185
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004186 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4187 PyErr_BadArgument();
4188 return (Py_UCS4)-1;
4189 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004190 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004191 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192 return (Py_UCS4)-1;
4193 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004194 data = PyUnicode_DATA(unicode);
4195 kind = PyUnicode_KIND(unicode);
4196 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197}
4198
4199int
4200PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4201{
4202 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004203 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004204 return -1;
4205 }
Victor Stinner488fa492011-12-12 00:01:39 +01004206 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004207 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004208 PyErr_SetString(PyExc_IndexError, "string index out of range");
4209 return -1;
4210 }
Victor Stinner488fa492011-12-12 00:01:39 +01004211 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004212 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004213 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4214 PyErr_SetString(PyExc_ValueError, "character out of range");
4215 return -1;
4216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004217 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4218 index, ch);
4219 return 0;
4220}
4221
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4227
Victor Stinner554f3f02010-06-16 23:33:54 +00004228/* create or adjust a UnicodeDecodeError */
4229static void
4230make_decode_exception(PyObject **exceptionObject,
4231 const char *encoding,
4232 const char *input, Py_ssize_t length,
4233 Py_ssize_t startpos, Py_ssize_t endpos,
4234 const char *reason)
4235{
4236 if (*exceptionObject == NULL) {
4237 *exceptionObject = PyUnicodeDecodeError_Create(
4238 encoding, input, length, startpos, endpos, reason);
4239 }
4240 else {
4241 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4242 goto onError;
4243 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4244 goto onError;
4245 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4246 goto onError;
4247 }
4248 return;
4249
4250onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004251 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004252}
4253
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders whose output is a
   wchar_t-based str object (*output).

   Looks up the error handler named by `errors` (cached in *errorHandler),
   creates or updates the UnicodeDecodeError in *exceptionObject, calls the
   handler and checks that it returned a (str, int) tuple.  On success the
   replacement string is copied into *output at *outpos (growing *output
   if required) and the state variables are adjusted:

     *input, *inend  re-read from the exception's object, since the
                     callback may have replaced it
     *endinpos       set to the position returned by the handler
     *inptr          set to *input + that position
     *outpos         advanced by the length of the replacement

   A negative position returned by the handler is taken relative to the
   end of the input.

   Return 0 on success, -1 on error.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  wcsncpy() must not be used here:
       it stops at the first null character in the replacement and
       zero-fills the remainder, which would silently drop everything
       after an embedded null. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004366
4367static int
4368unicode_decode_call_errorhandler_writer(
4369 const char *errors, PyObject **errorHandler,
4370 const char *encoding, const char *reason,
4371 const char **input, const char **inend, Py_ssize_t *startinpos,
4372 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4373 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4374{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004375 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004376
4377 PyObject *restuple = NULL;
4378 PyObject *repunicode = NULL;
4379 Py_ssize_t insize;
4380 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004381 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004382 PyObject *inputobj = NULL;
4383
4384 if (*errorHandler == NULL) {
4385 *errorHandler = PyCodec_LookupError(errors);
4386 if (*errorHandler == NULL)
4387 goto onError;
4388 }
4389
4390 make_decode_exception(exceptionObject,
4391 encoding,
4392 *input, *inend - *input,
4393 *startinpos, *endinpos,
4394 reason);
4395 if (*exceptionObject == NULL)
4396 goto onError;
4397
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004398 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004399 if (restuple == NULL)
4400 goto onError;
4401 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004402 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004403 goto onError;
4404 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004405 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004406 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004407
4408 /* Copy back the bytes variables, which might have been modified by the
4409 callback */
4410 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4411 if (!inputobj)
4412 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004413 *input = PyBytes_AS_STRING(inputobj);
4414 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004415 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004416 /* we can DECREF safely, as the exception has another reference,
4417 so the object won't go away. */
4418 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004421 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004422 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004423 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004424 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004425 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426
Victor Stinner170ca6f2013-04-18 00:25:28 +02004427 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004428 if (replen > 1) {
4429 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004430 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004431 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4432 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4433 goto onError;
4434 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004435 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004436 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004439 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004440
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004442 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004447 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448}
4449
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450/* --- UTF-7 Codec -------------------------------------------------------- */
4451
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452/* See RFC2152 for details. We encode conservatively and decode liberally. */
4453
/* Base-64 helper macros for the UTF-7 codec.  Each one may evaluate its
   argument more than once, so pass only side-effect-free expressions. */

/* True if c is one of the 64 base-64 alphabet characters. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||          \
     ((c) >= '0' && (c) <= '9') ||        \
     ((c) >= 'a' && (c) <= 'z') ||        \
     ((c) >= 'A' && (c) <= 'Z'))

/* Base-64 value (0..63) of c; only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte found outside a shift sequence decodes
 * as itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4484
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times; directO and directWS are parenthesized
 * so that any expression can be passed for them safely.  The range
 * test on c comes first, so the table is never indexed out of bounds. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530
Alexander Belopolsky40018472011-02-26 01:02:56 +00004531PyObject *
4532PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004533 Py_ssize_t size,
4534 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004536 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4537}
4538
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539/* The decoder. The only state we preserve is our read position,
4540 * i.e. how many characters we have consumed. So if we end in the
4541 * middle of a shift sequence we have to back off the read position
4542 * and the output to the beginning of the sequence, otherwise we lose
4543 * all the shift state (seen bits, number of bits seen, high
4544 * surrogate). */
4545
Alexander Belopolsky40018472011-02-26 01:02:56 +00004546PyObject *
4547PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004548 Py_ssize_t size,
4549 const char *errors,
4550 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004551{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004553 Py_ssize_t startinpos;
4554 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004556 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557 const char *errmsg = "";
4558 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004559 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 unsigned int base64bits = 0;
4561 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004562 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 PyObject *errorHandler = NULL;
4564 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004565
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004566 if (size == 0) {
4567 if (consumed)
4568 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004569 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004570 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004572 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004573 _PyUnicodeWriter_Init(&writer);
4574 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004575
4576 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004577 e = s + size;
4578
4579 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004580 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004582 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 if (inShift) { /* in a base-64 section */
4585 if (IS_BASE64(ch)) { /* consume a base-64 character */
4586 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4587 base64bits += 6;
4588 s++;
4589 if (base64bits >= 16) {
4590 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004591 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 base64bits -= 16;
4593 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004594 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 if (surrogate) {
4596 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004597 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4598 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004599 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004600 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004602 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 }
4604 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004605 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004606 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 }
4609 }
Victor Stinner551ac952011-11-29 22:58:13 +01004610 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 /* first surrogate */
4612 surrogate = outCh;
4613 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004615 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004616 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617 }
4618 }
4619 }
4620 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 if (base64bits > 0) { /* left-over bits */
4623 if (base64bits >= 6) {
4624 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004625 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004626 errmsg = "partial character in shift sequence";
4627 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004628 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 else {
4630 /* Some bits remain; they should be zero */
4631 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004632 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 errmsg = "non-zero padding bits in shift sequence";
4634 goto utf7Error;
4635 }
4636 }
4637 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004638 if (surrogate && DECODE_DIRECT(ch)) {
4639 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4640 goto onError;
4641 }
4642 surrogate = 0;
4643 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644 /* '-' is absorbed; other terminating
4645 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004646 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648 }
4649 }
4650 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 s++; /* consume '+' */
4653 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004655 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004656 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 }
4658 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004660 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004661 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004663 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004664 }
4665 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004666 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004667 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004668 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004669 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004670 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 else {
4672 startinpos = s-starts;
4673 s++;
4674 errmsg = "unexpected special character";
4675 goto utf7Error;
4676 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 errors, &errorHandler,
4682 "utf7", errmsg,
4683 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004684 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004686 }
4687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 /* end of string */
4689
4690 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4691 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004692 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 if (surrogate ||
4694 (base64bits >= 6) ||
4695 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004696 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004697 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 errors, &errorHandler,
4699 "utf7", "unterminated shift sequence",
4700 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004701 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 goto onError;
4703 if (s < e)
4704 goto restart;
4705 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004706 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707
4708 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004709 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004710 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004711 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004712 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004713 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004714 writer.kind, writer.data, shiftOutStart);
4715 Py_XDECREF(errorHandler);
4716 Py_XDECREF(exc);
4717 _PyUnicodeWriter_Dealloc(&writer);
4718 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004719 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004720 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004721 }
4722 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004723 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004725 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 Py_XDECREF(errorHandler);
4728 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004729 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004730
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 Py_XDECREF(errorHandler);
4733 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004734 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735 return NULL;
4736}
4737
4738
Alexander Belopolsky40018472011-02-26 01:02:56 +00004739PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004740_PyUnicode_EncodeUTF7(PyObject *str,
4741 int base64SetO,
4742 int base64WhiteSpace,
4743 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004744{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004745 int kind;
4746 void *data;
4747 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004748 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004749 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004750 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004751 unsigned int base64bits = 0;
4752 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753 char * out;
4754 char * start;
4755
Benjamin Petersonbac79492012-01-14 13:34:47 -05004756 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004757 return NULL;
4758 kind = PyUnicode_KIND(str);
4759 data = PyUnicode_DATA(str);
4760 len = PyUnicode_GET_LENGTH(str);
4761
4762 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004763 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004765 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004766 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004767 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004768 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004769 if (v == NULL)
4770 return NULL;
4771
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004772 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004773 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004774 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004775
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 if (inShift) {
4777 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4778 /* shifting out */
4779 if (base64bits) { /* output remaining bits */
4780 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4781 base64buffer = 0;
4782 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004783 }
4784 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004785 /* Characters not in the BASE64 set implicitly unshift the sequence
4786 so no '-' is required, except if the character is itself a '-' */
4787 if (IS_BASE64(ch) || ch == '-') {
4788 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004789 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 *out++ = (char) ch;
4791 }
4792 else {
4793 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004794 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004795 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004796 else { /* not in a shift sequence */
4797 if (ch == '+') {
4798 *out++ = '+';
4799 *out++ = '-';
4800 }
4801 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4802 *out++ = (char) ch;
4803 }
4804 else {
4805 *out++ = '+';
4806 inShift = 1;
4807 goto encode_char;
4808 }
4809 }
4810 continue;
4811encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004812 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004813 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004814
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 /* code first surrogate */
4816 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004817 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004818 while (base64bits >= 6) {
4819 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4820 base64bits -= 6;
4821 }
4822 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004823 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004824 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004825 base64bits += 16;
4826 base64buffer = (base64buffer << 16) | ch;
4827 while (base64bits >= 6) {
4828 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4829 base64bits -= 6;
4830 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004831 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832 if (base64bits)
4833 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4834 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004835 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004836 if (_PyBytes_Resize(&v, out - start) < 0)
4837 return NULL;
4838 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004839}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004840PyObject *
4841PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4842 Py_ssize_t size,
4843 int base64SetO,
4844 int base64WhiteSpace,
4845 const char *errors)
4846{
4847 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004848 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004849 if (tmp == NULL)
4850 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004851 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004852 base64WhiteSpace, errors);
4853 Py_DECREF(tmp);
4854 return result;
4855}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004856
Antoine Pitrou244651a2009-05-04 18:56:13 +00004857#undef IS_BASE64
4858#undef FROM_BASE64
4859#undef TO_BASE64
4860#undef DECODE_DIRECT
4861#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004862
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863/* --- UTF-8 Codec -------------------------------------------------------- */
4864
Alexander Belopolsky40018472011-02-26 01:02:56 +00004865PyObject *
4866PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004867 Py_ssize_t size,
4868 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869{
Walter Dörwald69652032004-09-07 20:24:22 +00004870 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4871}
4872
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004873#include "stringlib/asciilib.h"
4874#include "stringlib/codecs.h"
4875#include "stringlib/undef.h"
4876
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004877#include "stringlib/ucs1lib.h"
4878#include "stringlib/codecs.h"
4879#include "stringlib/undef.h"
4880
4881#include "stringlib/ucs2lib.h"
4882#include "stringlib/codecs.h"
4883#include "stringlib/undef.h"
4884
4885#include "stringlib/ucs4lib.h"
4886#include "stringlib/codecs.h"
4887#include "stringlib/undef.h"
4888
Antoine Pitrouab868312009-01-10 15:40:25 +00004889/* Mask to quickly check whether a C 'long' contains a
4890 non-ASCII, UTF8-encoded char. */
4891#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004892# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004893#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004894# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004895#else
4896# error C 'long' size should be either 4 or 8!
4897#endif
4898
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899static Py_ssize_t
4900ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004901{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004903 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004904
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004905 /*
4906 * Issue #17237: m68k is a bit different from most architectures in
4907 * that objects do not use "natural alignment" - for example, int and
4908 * long are only aligned at 2-byte boundaries. Therefore the assert()
4909 * won't work; also, tests have shown that skipping the "optimised
4910 * version" will even speed up m68k.
4911 */
4912#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004914 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4915 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 /* Fast path, see in STRINGLIB(utf8_decode) for
4917 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004918 /* Help allocation */
4919 const char *_p = p;
4920 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 while (_p < aligned_end) {
4922 unsigned long value = *(const unsigned long *) _p;
4923 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 *((unsigned long *)q) = value;
4926 _p += SIZEOF_LONG;
4927 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004928 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004929 p = _p;
4930 while (p < end) {
4931 if ((unsigned char)*p & 0x80)
4932 break;
4933 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004938#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939 while (p < end) {
4940 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4941 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004942 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004943 /* Help allocation */
4944 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 while (_p < aligned_end) {
4946 unsigned long value = *(unsigned long *) _p;
4947 if (value & ASCII_CHAR_MASK)
4948 break;
4949 _p += SIZEOF_LONG;
4950 }
4951 p = _p;
4952 if (_p == end)
4953 break;
4954 }
4955 if ((unsigned char)*p & 0x80)
4956 break;
4957 ++p;
4958 }
4959 memcpy(dest, start, p - start);
4960 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961}
Antoine Pitrouab868312009-01-10 15:40:25 +00004962
Victor Stinner785938e2011-12-11 20:09:03 +01004963PyObject *
4964PyUnicode_DecodeUTF8Stateful(const char *s,
4965 Py_ssize_t size,
4966 const char *errors,
4967 Py_ssize_t *consumed)
4968{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004970 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972
4973 Py_ssize_t startinpos;
4974 Py_ssize_t endinpos;
4975 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004976 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004978 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004979
4980 if (size == 0) {
4981 if (consumed)
4982 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004983 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004984 }
4985
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4987 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004988 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 *consumed = 1;
4990 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004991 }
4992
Victor Stinner8f674cc2013-04-17 23:02:17 +02004993 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004994 writer.min_length = size;
4995 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004997
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004998 writer.pos = ascii_decode(s, end, writer.data);
4999 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 while (s < end) {
5001 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005003
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005005 if (PyUnicode_IS_ASCII(writer.buffer))
5006 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005008 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005010 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 } else {
5012 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005013 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005014 }
5015
5016 switch (ch) {
5017 case 0:
5018 if (s == end || consumed)
5019 goto End;
5020 errmsg = "unexpected end of data";
5021 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005022 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005023 break;
5024 case 1:
5025 errmsg = "invalid start byte";
5026 startinpos = s - starts;
5027 endinpos = startinpos + 1;
5028 break;
5029 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005030 case 3:
5031 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005032 errmsg = "invalid continuation byte";
5033 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005034 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005035 break;
5036 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005037 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005038 goto onError;
5039 continue;
5040 }
5041
Victor Stinner1d65d912015-10-05 13:43:50 +02005042 if (error_handler == _Py_ERROR_UNKNOWN)
5043 error_handler = get_error_handler(errors);
5044
5045 switch (error_handler) {
5046 case _Py_ERROR_IGNORE:
5047 s += (endinpos - startinpos);
5048 break;
5049
5050 case _Py_ERROR_REPLACE:
5051 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5052 goto onError;
5053 s += (endinpos - startinpos);
5054 break;
5055
5056 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005057 {
5058 Py_ssize_t i;
5059
Victor Stinner1d65d912015-10-05 13:43:50 +02005060 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5061 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005062 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005063 ch = (Py_UCS4)(unsigned char)(starts[i]);
5064 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5065 ch + 0xdc00);
5066 writer.pos++;
5067 }
5068 s += (endinpos - startinpos);
5069 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005070 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005071
5072 default:
5073 if (unicode_decode_call_errorhandler_writer(
5074 errors, &error_handler_obj,
5075 "utf-8", errmsg,
5076 &starts, &end, &startinpos, &endinpos, &exc, &s,
5077 &writer))
5078 goto onError;
5079 }
Victor Stinner785938e2011-12-11 20:09:03 +01005080 }
5081
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 if (consumed)
5084 *consumed = s - starts;
5085
Victor Stinner1d65d912015-10-05 13:43:50 +02005086 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005088 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089
5090onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005091 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005093 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005094 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005095}
5096
Xavier de Gaye76febd02016-12-15 20:59:58 +01005097#if defined(__APPLE__) || defined(__ANDROID__)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005098
5099/* Simplified UTF-8 decoder using surrogateescape error handler,
Xavier de Gaye76febd02016-12-15 20:59:58 +01005100 used to decode the command line arguments on Mac OS X and Android.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005101
5102 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005103 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005104
5105wchar_t*
5106_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5107{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005108 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109 wchar_t *unicode;
5110 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005111
5112 /* Note: size will always be longer than the resulting Unicode
5113 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005114 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005115 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005116 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005117 if (!unicode)
5118 return NULL;
5119
5120 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005121 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005122 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005123 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005125#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005126 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005127#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005128 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005129#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 if (ch > 0xFF) {
5131#if SIZEOF_WCHAR_T == 4
5132 assert(0);
5133#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005134 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135 /* compute and append the two surrogates: */
5136 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5137 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5138#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005139 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005140 else {
5141 if (!ch && s == e)
5142 break;
5143 /* surrogateescape */
5144 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5145 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148 return unicode;
5149}
5150
Xavier de Gaye76febd02016-12-15 20:59:58 +01005151#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153/* Primary internal function which creates utf8 encoded bytes objects.
5154
5155 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005156 and allocate exactly as much space needed at the end. Else allocate the
5157 maximum possible needed (4 result bytes per Unicode character), and return
5158 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005159*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005160PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005161_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Victor Stinner6099a032011-12-18 14:22:26 +01005163 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005164 void *data;
5165 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005167 if (!PyUnicode_Check(unicode)) {
5168 PyErr_BadArgument();
5169 return NULL;
5170 }
5171
5172 if (PyUnicode_READY(unicode) == -1)
5173 return NULL;
5174
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005175 if (PyUnicode_UTF8(unicode))
5176 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5177 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005178
5179 kind = PyUnicode_KIND(unicode);
5180 data = PyUnicode_DATA(unicode);
5181 size = PyUnicode_GET_LENGTH(unicode);
5182
Benjamin Petersonead6b532011-12-20 17:23:42 -06005183 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005184 default:
5185 assert(0);
5186 case PyUnicode_1BYTE_KIND:
5187 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5188 assert(!PyUnicode_IS_ASCII(unicode));
5189 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5190 case PyUnicode_2BYTE_KIND:
5191 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5192 case PyUnicode_4BYTE_KIND:
5193 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195}
5196
Alexander Belopolsky40018472011-02-26 01:02:56 +00005197PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005198PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5199 Py_ssize_t size,
5200 const char *errors)
5201{
5202 PyObject *v, *unicode;
5203
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005204 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005205 if (unicode == NULL)
5206 return NULL;
5207 v = _PyUnicode_AsUTF8String(unicode, errors);
5208 Py_DECREF(unicode);
5209 return v;
5210}
5211
5212PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005213PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005215 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216}
5217
Walter Dörwald41980ca2007-08-16 21:55:45 +00005218/* --- UTF-32 Codec ------------------------------------------------------- */
5219
5220PyObject *
5221PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 Py_ssize_t size,
5223 const char *errors,
5224 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225{
5226 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5227}
5228
5229PyObject *
5230PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 Py_ssize_t size,
5232 const char *errors,
5233 int *byteorder,
5234 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235{
5236 const char *starts = s;
5237 Py_ssize_t startinpos;
5238 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005239 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005240 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005241 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005242 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005243 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244 PyObject *errorHandler = NULL;
5245 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005246
Walter Dörwald41980ca2007-08-16 21:55:45 +00005247 q = (unsigned char *)s;
5248 e = q + size;
5249
5250 if (byteorder)
5251 bo = *byteorder;
5252
5253 /* Check for BOM marks (U+FEFF) in the input and adjust current
5254 byte order setting accordingly. In native mode, the leading BOM
5255 mark is skipped, in all other modes, it is copied to the output
5256 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005257 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005258 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005259 if (bom == 0x0000FEFF) {
5260 bo = -1;
5261 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005263 else if (bom == 0xFFFE0000) {
5264 bo = 1;
5265 q += 4;
5266 }
5267 if (byteorder)
5268 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005269 }
5270
Victor Stinnere64322e2012-10-30 23:12:47 +01005271 if (q == e) {
5272 if (consumed)
5273 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005274 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005275 }
5276
Victor Stinnere64322e2012-10-30 23:12:47 +01005277#ifdef WORDS_BIGENDIAN
5278 le = bo < 0;
5279#else
5280 le = bo <= 0;
5281#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005282 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005283
Victor Stinner8f674cc2013-04-17 23:02:17 +02005284 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005285 writer.min_length = (e - q + 3) / 4;
5286 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005287 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005288
Victor Stinnere64322e2012-10-30 23:12:47 +01005289 while (1) {
5290 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005291 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005292
Victor Stinnere64322e2012-10-30 23:12:47 +01005293 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005294 enum PyUnicode_Kind kind = writer.kind;
5295 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005297 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 if (le) {
5299 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005300 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 if (ch > maxch)
5302 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005303 if (kind != PyUnicode_1BYTE_KIND &&
5304 Py_UNICODE_IS_SURROGATE(ch))
5305 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005307 q += 4;
5308 } while (q <= last);
5309 }
5310 else {
5311 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005312 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005313 if (ch > maxch)
5314 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 if (kind != PyUnicode_1BYTE_KIND &&
5316 Py_UNICODE_IS_SURROGATE(ch))
5317 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005319 q += 4;
5320 } while (q <= last);
5321 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005322 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005323 }
5324
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005326 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005327 startinpos = ((const char *)q) - starts;
5328 endinpos = startinpos + 4;
5329 }
5330 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005333 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 startinpos = ((const char *)q) - starts;
5336 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005338 else {
5339 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005340 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005341 goto onError;
5342 q += 4;
5343 continue;
5344 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005345 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005346 startinpos = ((const char *)q) - starts;
5347 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005349
5350 /* The remaining input chars are ignored if the callback
5351 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005352 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005356 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005358 }
5359
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005362
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363 Py_XDECREF(errorHandler);
5364 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005365 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005366
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005369 Py_XDECREF(errorHandler);
5370 Py_XDECREF(exc);
5371 return NULL;
5372}
5373
5374PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005375_PyUnicode_EncodeUTF32(PyObject *str,
5376 const char *errors,
5377 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005379 enum PyUnicode_Kind kind;
5380 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005381 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005382 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005383 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005384#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005386#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005387 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005388#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005389 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005390 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005391 PyObject *errorHandler = NULL;
5392 PyObject *exc = NULL;
5393 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005394
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005395 if (!PyUnicode_Check(str)) {
5396 PyErr_BadArgument();
5397 return NULL;
5398 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005399 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400 return NULL;
5401 kind = PyUnicode_KIND(str);
5402 data = PyUnicode_DATA(str);
5403 len = PyUnicode_GET_LENGTH(str);
5404
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005406 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005408 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005409 if (v == NULL)
5410 return NULL;
5411
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005412 /* output buffer is 4-bytes aligned */
5413 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005414 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005415 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005416 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005417 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005421 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 else
5425 encoding = "utf-32";
5426
5427 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5429 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005430 }
5431
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005432 pos = 0;
5433 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005434 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005435
5436 if (kind == PyUnicode_2BYTE_KIND) {
5437 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5438 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005439 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 else {
5441 assert(kind == PyUnicode_4BYTE_KIND);
5442 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5443 &out, native_ordering);
5444 }
5445 if (pos == len)
5446 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005447
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005448 rep = unicode_encode_call_errorhandler(
5449 errors, &errorHandler,
5450 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005451 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005452 if (!rep)
5453 goto error;
5454
5455 if (PyBytes_Check(rep)) {
5456 repsize = PyBytes_GET_SIZE(rep);
5457 if (repsize & 3) {
5458 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005459 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 "surrogates not allowed");
5461 goto error;
5462 }
5463 moreunits = repsize / 4;
5464 }
5465 else {
5466 assert(PyUnicode_Check(rep));
5467 if (PyUnicode_READY(rep) < 0)
5468 goto error;
5469 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5470 if (!PyUnicode_IS_ASCII(rep)) {
5471 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005472 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005473 "surrogates not allowed");
5474 goto error;
5475 }
5476 }
5477
5478 /* four bytes are reserved for each surrogate */
5479 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005480 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005481 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005482 /* integer overflow */
5483 PyErr_NoMemory();
5484 goto error;
5485 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005486 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005487 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005488 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 }
5490
5491 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005492 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005493 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005496 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5497 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 }
5499
5500 Py_CLEAR(rep);
5501 }
5502
5503 /* Cut back to size actually needed. This is necessary for, for example,
5504 encoding of a string containing isolated surrogates and the 'ignore'
5505 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005506 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005507 if (nsize != PyBytes_GET_SIZE(v))
5508 _PyBytes_Resize(&v, nsize);
5509 Py_XDECREF(errorHandler);
5510 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005511 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005512 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005513 error:
5514 Py_XDECREF(rep);
5515 Py_XDECREF(errorHandler);
5516 Py_XDECREF(exc);
5517 Py_XDECREF(v);
5518 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005519}
5520
Alexander Belopolsky40018472011-02-26 01:02:56 +00005521PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005522PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5523 Py_ssize_t size,
5524 const char *errors,
5525 int byteorder)
5526{
5527 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005528 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005529 if (tmp == NULL)
5530 return NULL;
5531 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5532 Py_DECREF(tmp);
5533 return result;
5534}
5535
5536PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005537PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005538{
Victor Stinnerb960b342011-11-20 19:12:52 +01005539 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005540}
5541
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542/* --- UTF-16 Codec ------------------------------------------------------- */
5543
Tim Peters772747b2001-08-09 22:21:55 +00005544PyObject *
5545PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 Py_ssize_t size,
5547 const char *errors,
5548 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549{
Walter Dörwald69652032004-09-07 20:24:22 +00005550 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5551}
5552
5553PyObject *
5554PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 Py_ssize_t size,
5556 const char *errors,
5557 int *byteorder,
5558 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t startinpos;
5562 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005563 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005564 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005565 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005566 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005567 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005568 PyObject *errorHandler = NULL;
5569 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005570 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571
Tim Peters772747b2001-08-09 22:21:55 +00005572 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005573 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574
5575 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005576 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005578 /* Check for BOM marks (U+FEFF) in the input and adjust current
5579 byte order setting accordingly. In native mode, the leading BOM
5580 mark is skipped, in all other modes, it is copied to the output
5581 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005582 if (bo == 0 && size >= 2) {
5583 const Py_UCS4 bom = (q[1] << 8) | q[0];
5584 if (bom == 0xFEFF) {
5585 q += 2;
5586 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005588 else if (bom == 0xFFFE) {
5589 q += 2;
5590 bo = 1;
5591 }
5592 if (byteorder)
5593 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 if (q == e) {
5597 if (consumed)
5598 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005599 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005600 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005601
Christian Heimes743e0cd2012-10-17 23:52:17 +02005602#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005603 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005604 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005605#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005606 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005607 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005608#endif
Tim Peters772747b2001-08-09 22:21:55 +00005609
Antoine Pitrou63065d72012-05-15 23:48:04 +02005610 /* Note: size will always be longer than the resulting Unicode
5611 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005612 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005613 writer.min_length = (e - q + 1) / 2;
5614 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005615 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005616
Antoine Pitrou63065d72012-05-15 23:48:04 +02005617 while (1) {
5618 Py_UCS4 ch = 0;
5619 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005620 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005621 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005622 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005623 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625 native_ordering);
5626 else
5627 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005629 native_ordering);
5630 } else if (kind == PyUnicode_2BYTE_KIND) {
5631 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005632 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005633 native_ordering);
5634 } else {
5635 assert(kind == PyUnicode_4BYTE_KIND);
5636 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005639 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005640 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641
Antoine Pitrou63065d72012-05-15 23:48:04 +02005642 switch (ch)
5643 {
5644 case 0:
5645 /* remaining byte at the end? (size should be even) */
5646 if (q == e || consumed)
5647 goto End;
5648 errmsg = "truncated data";
5649 startinpos = ((const char *)q) - starts;
5650 endinpos = ((const char *)e) - starts;
5651 break;
5652 /* The remaining input chars are ignored if the callback
5653 chooses to skip the input */
5654 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005655 q -= 2;
5656 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005657 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005658 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005659 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005660 endinpos = ((const char *)e) - starts;
5661 break;
5662 case 2:
5663 errmsg = "illegal encoding";
5664 startinpos = ((const char *)q) - 2 - starts;
5665 endinpos = startinpos + 2;
5666 break;
5667 case 3:
5668 errmsg = "illegal UTF-16 surrogate";
5669 startinpos = ((const char *)q) - 4 - starts;
5670 endinpos = startinpos + 2;
5671 break;
5672 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005673 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005674 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 continue;
5676 }
5677
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005679 errors,
5680 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005681 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005682 &starts,
5683 (const char **)&e,
5684 &startinpos,
5685 &endinpos,
5686 &exc,
5687 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 }
5691
Antoine Pitrou63065d72012-05-15 23:48:04 +02005692End:
Walter Dörwald69652032004-09-07 20:24:22 +00005693 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 Py_XDECREF(errorHandler);
5697 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005698 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005701 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 Py_XDECREF(errorHandler);
5703 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 return NULL;
5705}
5706
Tim Peters772747b2001-08-09 22:21:55 +00005707PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005708_PyUnicode_EncodeUTF16(PyObject *str,
5709 const char *errors,
5710 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005712 enum PyUnicode_Kind kind;
5713 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005714 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005715 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005716 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005717 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005718#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005719 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005720#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005721 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005722#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005723 const char *encoding;
5724 Py_ssize_t nsize, pos;
5725 PyObject *errorHandler = NULL;
5726 PyObject *exc = NULL;
5727 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005728
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729 if (!PyUnicode_Check(str)) {
5730 PyErr_BadArgument();
5731 return NULL;
5732 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005733 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005734 return NULL;
5735 kind = PyUnicode_KIND(str);
5736 data = PyUnicode_DATA(str);
5737 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005738
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005739 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005740 if (kind == PyUnicode_4BYTE_KIND) {
5741 const Py_UCS4 *in = (const Py_UCS4 *)data;
5742 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005743 while (in < end) {
5744 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005745 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005746 }
5747 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005748 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005749 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005752 nsize = len + pairs + (byteorder == 0);
5753 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005754 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005758 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005759 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005760 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005761 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005762 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005763 }
5764 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005765 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005766 }
Tim Peters772747b2001-08-09 22:21:55 +00005767
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 if (kind == PyUnicode_1BYTE_KIND) {
5769 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5770 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005771 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005772
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005773 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005775 }
5776 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005777 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005778 }
5779 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005780 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005781 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005782
5783 pos = 0;
5784 while (pos < len) {
5785 Py_ssize_t repsize, moreunits;
5786
5787 if (kind == PyUnicode_2BYTE_KIND) {
5788 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5789 &out, native_ordering);
5790 }
5791 else {
5792 assert(kind == PyUnicode_4BYTE_KIND);
5793 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5794 &out, native_ordering);
5795 }
5796 if (pos == len)
5797 break;
5798
5799 rep = unicode_encode_call_errorhandler(
5800 errors, &errorHandler,
5801 encoding, "surrogates not allowed",
5802 str, &exc, pos, pos + 1, &pos);
5803 if (!rep)
5804 goto error;
5805
5806 if (PyBytes_Check(rep)) {
5807 repsize = PyBytes_GET_SIZE(rep);
5808 if (repsize & 1) {
5809 raise_encode_exception(&exc, encoding,
5810 str, pos - 1, pos,
5811 "surrogates not allowed");
5812 goto error;
5813 }
5814 moreunits = repsize / 2;
5815 }
5816 else {
5817 assert(PyUnicode_Check(rep));
5818 if (PyUnicode_READY(rep) < 0)
5819 goto error;
5820 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5821 if (!PyUnicode_IS_ASCII(rep)) {
5822 raise_encode_exception(&exc, encoding,
5823 str, pos - 1, pos,
5824 "surrogates not allowed");
5825 goto error;
5826 }
5827 }
5828
5829 /* two bytes are reserved for each surrogate */
5830 if (moreunits > 1) {
5831 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005832 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005833 /* integer overflow */
5834 PyErr_NoMemory();
5835 goto error;
5836 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005837 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005838 goto error;
5839 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5840 }
5841
5842 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005843 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844 out += moreunits;
5845 } else /* rep is unicode */ {
5846 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5847 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5848 &out, native_ordering);
5849 }
5850
5851 Py_CLEAR(rep);
5852 }
5853
5854 /* Cut back to size actually needed. This is necessary for, for example,
5855 encoding of a string containing isolated surrogates and the 'ignore' handler
5856 is used. */
5857 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5858 if (nsize != PyBytes_GET_SIZE(v))
5859 _PyBytes_Resize(&v, nsize);
5860 Py_XDECREF(errorHandler);
5861 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005862 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005863 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005864 error:
5865 Py_XDECREF(rep);
5866 Py_XDECREF(errorHandler);
5867 Py_XDECREF(exc);
5868 Py_XDECREF(v);
5869 return NULL;
5870#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871}
5872
Alexander Belopolsky40018472011-02-26 01:02:56 +00005873PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5875 Py_ssize_t size,
5876 const char *errors,
5877 int byteorder)
5878{
5879 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005880 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005881 if (tmp == NULL)
5882 return NULL;
5883 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5884 Py_DECREF(tmp);
5885 return result;
5886}
5887
5888PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005889PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892}
5893
5894/* --- Unicode Escape Codec ----------------------------------------------- */
5895
Fredrik Lundh06d12682001-01-24 07:59:11 +00005896static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005897
Alexander Belopolsky40018472011-02-26 01:02:56 +00005898PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005899_PyUnicode_DecodeUnicodeEscape(const char *s,
5900 Py_ssize_t size,
5901 const char *errors,
5902 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 PyObject *errorHandler = NULL;
5908 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005909
Eric V. Smith42454af2016-10-31 09:22:08 -04005910 // so we can remember if we've seen an invalid escape char or not
5911 *first_invalid_escape = NULL;
5912
Victor Stinner62ec3312016-09-06 17:04:34 -07005913 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005914 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005915 }
5916 /* Escaped strings will always be longer than the resulting
5917 Unicode string, so we start with size here and then reduce the
5918 length after conversion to the true value.
5919 (but if the error callback returns a long replacement string
5920 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005921 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005922 writer.min_length = size;
5923 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5924 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005925 }
5926
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 end = s + size;
5928 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005929 unsigned char c = (unsigned char) *s++;
5930 Py_UCS4 ch;
5931 int count;
5932 Py_ssize_t startinpos;
5933 Py_ssize_t endinpos;
5934 const char *message;
5935
5936#define WRITE_ASCII_CHAR(ch) \
5937 do { \
5938 assert(ch <= 127); \
5939 assert(writer.pos < writer.size); \
5940 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5941 } while(0)
5942
5943#define WRITE_CHAR(ch) \
5944 do { \
5945 if (ch <= writer.maxchar) { \
5946 assert(writer.pos < writer.size); \
5947 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5948 } \
5949 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5950 goto onError; \
5951 } \
5952 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953
5954 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005955 if (c != '\\') {
5956 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 continue;
5958 }
5959
Victor Stinner62ec3312016-09-06 17:04:34 -07005960 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005962 if (s >= end) {
5963 message = "\\ at end of string";
5964 goto error;
5965 }
5966 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005967
Victor Stinner62ec3312016-09-06 17:04:34 -07005968 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005969 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005972 case '\n': continue;
5973 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5974 case '\'': WRITE_ASCII_CHAR('\''); continue;
5975 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5976 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005977 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005978 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5979 case 't': WRITE_ASCII_CHAR('\t'); continue;
5980 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5981 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005982 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005983 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005984 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005985 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 case '0': case '1': case '2': case '3':
5989 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005990 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005991 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005992 ch = (ch<<3) + *s++ - '0';
5993 if (s < end && '0' <= *s && *s <= '7') {
5994 ch = (ch<<3) + *s++ - '0';
5995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005997 WRITE_CHAR(ch);
5998 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 /* hex escapes */
6001 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006003 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006004 message = "truncated \\xXX escape";
6005 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006009 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006010 message = "truncated \\uXXXX escape";
6011 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006014 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006015 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006016 message = "truncated \\UXXXXXXXX escape";
6017 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006018 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006019 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006020 ch <<= 4;
6021 if (c >= '0' && c <= '9') {
6022 ch += c - '0';
6023 }
6024 else if (c >= 'a' && c <= 'f') {
6025 ch += c - ('a' - 10);
6026 }
6027 else if (c >= 'A' && c <= 'F') {
6028 ch += c - ('A' - 10);
6029 }
6030 else {
6031 break;
6032 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006033 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006034 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006035 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006036 }
6037
6038 /* when we get here, ch is a 32-bit unicode character */
6039 if (ch > MAX_UNICODE) {
6040 message = "illegal Unicode character";
6041 goto error;
6042 }
6043
6044 WRITE_CHAR(ch);
6045 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006046
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006048 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049 if (ucnhash_CAPI == NULL) {
6050 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006051 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6052 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006053 if (ucnhash_CAPI == NULL) {
6054 PyErr_SetString(
6055 PyExc_UnicodeError,
6056 "\\N escapes not supported (can't load unicodedata module)"
6057 );
6058 goto onError;
6059 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006061
6062 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006064 const char *start = ++s;
6065 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006067 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006068 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006069 namelen = s - start;
6070 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006073 ch = 0xffffffff; /* in case 'getcode' messes up */
6074 if (namelen <= INT_MAX &&
6075 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6076 &ch, 0)) {
6077 assert(ch <= MAX_UNICODE);
6078 WRITE_CHAR(ch);
6079 continue;
6080 }
6081 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006082 }
6083 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006084 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006085
6086 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006087 if (*first_invalid_escape == NULL) {
6088 *first_invalid_escape = s-1; /* Back up one char, since we've
6089 already incremented s. */
6090 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006091 WRITE_ASCII_CHAR('\\');
6092 WRITE_CHAR(c);
6093 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006095
6096 error:
6097 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006098 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006099 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006100 errors, &errorHandler,
6101 "unicodeescape", message,
6102 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006103 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006104 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006105 }
6106 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6107 goto onError;
6108 }
6109
6110#undef WRITE_ASCII_CHAR
6111#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006113
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006114 Py_XDECREF(errorHandler);
6115 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006116 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006117
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006119 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 Py_XDECREF(errorHandler);
6121 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 return NULL;
6123}
6124
Eric V. Smith42454af2016-10-31 09:22:08 -04006125PyObject *
6126PyUnicode_DecodeUnicodeEscape(const char *s,
6127 Py_ssize_t size,
6128 const char *errors)
6129{
6130 const char *first_invalid_escape;
6131 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6132 &first_invalid_escape);
6133 if (result == NULL)
6134 return NULL;
6135 if (first_invalid_escape != NULL) {
6136 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6137 "invalid escape sequence '\\%c'",
6138 *first_invalid_escape) < 0) {
6139 Py_DECREF(result);
6140 return NULL;
6141 }
6142 }
6143 return result;
6144}
6145
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006146/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147
Alexander Belopolsky40018472011-02-26 01:02:56 +00006148PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006149PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006152 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006154 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157
Ezio Melottie7f90372012-10-05 03:33:31 +03006158 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006159 escape.
6160
Ezio Melottie7f90372012-10-05 03:33:31 +03006161 For UCS1 strings it's '\xxx', 4 bytes per source character.
6162 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6163 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006164 */
6165
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 if (!PyUnicode_Check(unicode)) {
6167 PyErr_BadArgument();
6168 return NULL;
6169 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006170 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 }
Victor Stinner358af132015-10-12 22:36:57 +02006173
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006175 if (len == 0) {
6176 return PyBytes_FromStringAndSize(NULL, 0);
6177 }
6178
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 kind = PyUnicode_KIND(unicode);
6180 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006181 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6182 bytes, and 1 byte characters 4. */
6183 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006184 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006185 return PyErr_NoMemory();
6186 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006187 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006188 if (repr == NULL) {
6189 return NULL;
6190 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191
Victor Stinner62ec3312016-09-06 17:04:34 -07006192 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006193 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006194 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006195
Victor Stinner62ec3312016-09-06 17:04:34 -07006196 /* U+0000-U+00ff range */
6197 if (ch < 0x100) {
6198 if (ch >= ' ' && ch < 127) {
6199 if (ch != '\\') {
6200 /* Copy printable US ASCII as-is */
6201 *p++ = (char) ch;
6202 }
6203 /* Escape backslashes */
6204 else {
6205 *p++ = '\\';
6206 *p++ = '\\';
6207 }
6208 }
Victor Stinner358af132015-10-12 22:36:57 +02006209
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 /* Map special whitespace to '\t', \n', '\r' */
6211 else if (ch == '\t') {
6212 *p++ = '\\';
6213 *p++ = 't';
6214 }
6215 else if (ch == '\n') {
6216 *p++ = '\\';
6217 *p++ = 'n';
6218 }
6219 else if (ch == '\r') {
6220 *p++ = '\\';
6221 *p++ = 'r';
6222 }
6223
6224 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6225 else {
6226 *p++ = '\\';
6227 *p++ = 'x';
6228 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6229 *p++ = Py_hexdigits[ch & 0x000F];
6230 }
Tim Petersced69f82003-09-16 20:30:58 +00006231 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006232 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006233 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 *p++ = '\\';
6235 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006236 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6237 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6238 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6239 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006241 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6242 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006243
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 /* Make sure that the first two digits are zero */
6245 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006246 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 *p++ = 'U';
6248 *p++ = '0';
6249 *p++ = '0';
6250 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6251 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6255 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258
Victor Stinner62ec3312016-09-06 17:04:34 -07006259 assert(p - PyBytes_AS_STRING(repr) > 0);
6260 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6261 return NULL;
6262 }
6263 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264}
6265
Alexander Belopolsky40018472011-02-26 01:02:56 +00006266PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006267PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6268 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006270 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006271 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006272 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006274 }
6275
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006276 result = PyUnicode_AsUnicodeEscapeString(tmp);
6277 Py_DECREF(tmp);
6278 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279}
6280
6281/* --- Raw Unicode Escape Codec ------------------------------------------- */
6282
Alexander Belopolsky40018472011-02-26 01:02:56 +00006283PyObject *
6284PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006285 Py_ssize_t size,
6286 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006288 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006289 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 PyObject *errorHandler = NULL;
6292 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006293
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006295 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006297
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 /* Escaped strings will always be longer than the resulting
6299 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 length after conversion to the true value. (But decoding error
6301 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006302 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 writer.min_length = size;
6304 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6305 goto onError;
6306 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006307
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 end = s + size;
6309 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006310 unsigned char c = (unsigned char) *s++;
6311 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006312 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 Py_ssize_t startinpos;
6314 Py_ssize_t endinpos;
6315 const char *message;
6316
6317#define WRITE_CHAR(ch) \
6318 do { \
6319 if (ch <= writer.maxchar) { \
6320 assert(writer.pos < writer.size); \
6321 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6322 } \
6323 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6324 goto onError; \
6325 } \
6326 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006329 if (c != '\\' || s >= end) {
6330 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006332 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006333
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 c = (unsigned char) *s++;
6335 if (c == 'u') {
6336 count = 4;
6337 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006339 else if (c == 'U') {
6340 count = 8;
6341 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006342 }
6343 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006344 assert(writer.pos < writer.size);
6345 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6346 WRITE_CHAR(c);
6347 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006348 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006349 startinpos = s - starts - 2;
6350
6351 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6352 for (ch = 0; count && s < end; ++s, --count) {
6353 c = (unsigned char)*s;
6354 ch <<= 4;
6355 if (c >= '0' && c <= '9') {
6356 ch += c - '0';
6357 }
6358 else if (c >= 'a' && c <= 'f') {
6359 ch += c - ('a' - 10);
6360 }
6361 else if (c >= 'A' && c <= 'F') {
6362 ch += c - ('A' - 10);
6363 }
6364 else {
6365 break;
6366 }
6367 }
6368 if (!count) {
6369 if (ch <= MAX_UNICODE) {
6370 WRITE_CHAR(ch);
6371 continue;
6372 }
6373 message = "\\Uxxxxxxxx out of range";
6374 }
6375
6376 endinpos = s-starts;
6377 writer.min_length = end - s + writer.pos;
6378 if (unicode_decode_call_errorhandler_writer(
6379 errors, &errorHandler,
6380 "rawunicodeescape", message,
6381 &starts, &end, &startinpos, &endinpos, &exc, &s,
6382 &writer)) {
6383 goto onError;
6384 }
6385 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6386 goto onError;
6387 }
6388
6389#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391 Py_XDECREF(errorHandler);
6392 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006393 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006394
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006396 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 Py_XDECREF(errorHandler);
6398 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006400
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401}
6402
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403
Alexander Belopolsky40018472011-02-26 01:02:56 +00006404PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006405PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406{
Victor Stinner62ec3312016-09-06 17:04:34 -07006407 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410 int kind;
6411 void *data;
6412 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006414 if (!PyUnicode_Check(unicode)) {
6415 PyErr_BadArgument();
6416 return NULL;
6417 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006419 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 kind = PyUnicode_KIND(unicode);
6422 data = PyUnicode_DATA(unicode);
6423 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 if (kind == PyUnicode_1BYTE_KIND) {
6425 return PyBytes_FromStringAndSize(data, len);
6426 }
Victor Stinner0e368262011-11-10 20:12:49 +01006427
Victor Stinner62ec3312016-09-06 17:04:34 -07006428 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6429 bytes, and 1 byte characters 4. */
6430 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006431
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 if (len > PY_SSIZE_T_MAX / expandsize) {
6433 return PyErr_NoMemory();
6434 }
6435 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6436 if (repr == NULL) {
6437 return NULL;
6438 }
6439 if (len == 0) {
6440 return repr;
6441 }
6442
6443 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006444 for (pos = 0; pos < len; pos++) {
6445 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006446
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6448 if (ch < 0x100) {
6449 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006450 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6452 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 *p++ = '\\';
6454 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006455 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6456 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6457 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6458 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6461 else {
6462 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6463 *p++ = '\\';
6464 *p++ = 'U';
6465 *p++ = '0';
6466 *p++ = '0';
6467 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6468 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6469 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6472 *p++ = Py_hexdigits[ch & 15];
6473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006475
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 assert(p > PyBytes_AS_STRING(repr));
6477 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6478 return NULL;
6479 }
6480 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481}
6482
Alexander Belopolsky40018472011-02-26 01:02:56 +00006483PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006484PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6485 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006487 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006488 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006490 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6492 Py_DECREF(tmp);
6493 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494}
6495
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006496/* --- Unicode Internal Codec ------------------------------------------- */
6497
Alexander Belopolsky40018472011-02-26 01:02:56 +00006498PyObject *
6499_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006500 Py_ssize_t size,
6501 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006502{
6503 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006504 Py_ssize_t startinpos;
6505 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006506 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006507 const char *end;
6508 const char *reason;
6509 PyObject *errorHandler = NULL;
6510 PyObject *exc = NULL;
6511
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006512 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006513 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006514 1))
6515 return NULL;
6516
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006517 if (size < 0) {
6518 PyErr_BadInternalCall();
6519 return NULL;
6520 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006521 if (size == 0)
6522 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006523
Victor Stinner8f674cc2013-04-17 23:02:17 +02006524 _PyUnicodeWriter_Init(&writer);
6525 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6526 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006528 }
6529 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006530
Victor Stinner8f674cc2013-04-17 23:02:17 +02006531 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006532 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006533 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006534 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006535 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006536 endinpos = end-starts;
6537 reason = "truncated input";
6538 goto error;
6539 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006540 /* We copy the raw representation one byte at a time because the
6541 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006542 ((char *) &uch)[0] = s[0];
6543 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006544#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006545 ((char *) &uch)[2] = s[2];
6546 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006547#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006548 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006549#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006550 /* We have to sanity check the raw data, otherwise doom looms for
6551 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006552 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006553 endinpos = s - starts + Py_UNICODE_SIZE;
6554 reason = "illegal code point (> 0x10FFFF)";
6555 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006556 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006557#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006558 s += Py_UNICODE_SIZE;
6559#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006560 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006561 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006562 Py_UNICODE uch2;
6563 ((char *) &uch2)[0] = s[0];
6564 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006565 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006566 {
Victor Stinner551ac952011-11-29 22:58:13 +01006567 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006568 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006569 }
6570 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006571#endif
6572
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006573 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006574 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006575 continue;
6576
6577 error:
6578 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006579 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006580 errors, &errorHandler,
6581 "unicode_internal", reason,
6582 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006583 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006584 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006585 }
6586
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006587 Py_XDECREF(errorHandler);
6588 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006589 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006590
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006592 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006593 Py_XDECREF(errorHandler);
6594 Py_XDECREF(exc);
6595 return NULL;
6596}
6597
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598/* --- Latin-1 Codec ------------------------------------------------------ */
6599
Alexander Belopolsky40018472011-02-26 01:02:56 +00006600PyObject *
6601PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006602 Py_ssize_t size,
6603 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006606 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607}
6608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006609/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006610static void
6611make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006612 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006613 PyObject *unicode,
6614 Py_ssize_t startpos, Py_ssize_t endpos,
6615 const char *reason)
6616{
6617 if (*exceptionObject == NULL) {
6618 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006620 encoding, unicode, startpos, endpos, reason);
6621 }
6622 else {
6623 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6624 goto onError;
6625 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6626 goto onError;
6627 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6628 goto onError;
6629 return;
6630 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006631 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006632 }
6633}
6634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006636static void
6637raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006638 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006639 PyObject *unicode,
6640 Py_ssize_t startpos, Py_ssize_t endpos,
6641 const char *reason)
6642{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006643 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006644 encoding, unicode, startpos, endpos, reason);
6645 if (*exceptionObject != NULL)
6646 PyCodec_StrictErrors(*exceptionObject);
6647}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648
6649/* error handling callback helper:
6650 build arguments, call the callback and check the arguments,
6651 put the result into newpos and return the replacement string, which
6652 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006653static PyObject *
6654unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006655 PyObject **errorHandler,
6656 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006658 Py_ssize_t startpos, Py_ssize_t endpos,
6659 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006660{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006661 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006663 PyObject *restuple;
6664 PyObject *resunicode;
6665
6666 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 }
6671
Benjamin Petersonbac79492012-01-14 13:34:47 -05006672 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006673 return NULL;
6674 len = PyUnicode_GET_LENGTH(unicode);
6675
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006676 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006678 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006681 restuple = PyObject_CallFunctionObjArgs(
6682 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006686 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 Py_DECREF(restuple);
6688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006690 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 &resunicode, newpos)) {
6692 Py_DECREF(restuple);
6693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006695 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6696 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6697 Py_DECREF(restuple);
6698 return NULL;
6699 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006701 *newpos = len + *newpos;
6702 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006703 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 Py_DECREF(restuple);
6705 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 Py_INCREF(resunicode);
6708 Py_DECREF(restuple);
6709 return resunicode;
6710}
6711
Alexander Belopolsky40018472011-02-26 01:02:56 +00006712static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006713unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006714 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006715 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 /* input state */
6718 Py_ssize_t pos=0, size;
6719 int kind;
6720 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 /* pointer into the output */
6722 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006723 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6724 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006725 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006726 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006727 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006728 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006729 /* output object */
6730 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731
Benjamin Petersonbac79492012-01-14 13:34:47 -05006732 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 return NULL;
6734 size = PyUnicode_GET_LENGTH(unicode);
6735 kind = PyUnicode_KIND(unicode);
6736 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737 /* allocate enough for a simple encoding without
6738 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006739 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006740 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006741
6742 _PyBytesWriter_Init(&writer);
6743 str = _PyBytesWriter_Alloc(&writer, size);
6744 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006745 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006747 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006748 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006749
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006751 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006753 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006754 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006755 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006757 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006760 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006762
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006763 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006765
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006766 /* Only overallocate the buffer if it's not the last write */
6767 writer.overallocate = (collend < size);
6768
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006770 if (error_handler == _Py_ERROR_UNKNOWN)
6771 error_handler = get_error_handler(errors);
6772
6773 switch (error_handler) {
6774 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006775 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006777
6778 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006779 memset(str, '?', collend - collstart);
6780 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006781 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006782 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006783 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 break;
Victor Stinner50149202015-09-22 00:26:54 +02006785
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006786 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006787 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006788 writer.min_size -= (collend - collstart);
6789 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006790 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006791 if (str == NULL)
6792 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006793 pos = collend;
6794 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006795
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006796 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006797 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006798 writer.min_size -= (collend - collstart);
6799 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006800 unicode, collstart, collend);
6801 if (str == NULL)
6802 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006803 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 break;
Victor Stinner50149202015-09-22 00:26:54 +02006805
Victor Stinnerc3713e92015-09-29 12:32:13 +02006806 case _Py_ERROR_SURROGATEESCAPE:
6807 for (i = collstart; i < collend; ++i) {
6808 ch = PyUnicode_READ(kind, data, i);
6809 if (ch < 0xdc80 || 0xdcff < ch) {
6810 /* Not a UTF-8b surrogate */
6811 break;
6812 }
6813 *str++ = (char)(ch - 0xdc00);
6814 ++pos;
6815 }
6816 if (i >= collend)
6817 break;
6818 collstart = pos;
6819 assert(collstart != collend);
6820 /* fallback to general error handling */
6821
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006823 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6824 encoding, reason, unicode, &exc,
6825 collstart, collend, &newpos);
6826 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006828
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006829 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006830 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006831
Victor Stinner6bd525b2015-10-09 13:10:05 +02006832 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006833 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006834 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006835 PyBytes_AS_STRING(rep),
6836 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006837 if (str == NULL)
6838 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006839 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006840 else {
6841 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006842
Victor Stinner6bd525b2015-10-09 13:10:05 +02006843 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006846 if (limit == 256 ?
6847 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6848 !PyUnicode_IS_ASCII(rep))
6849 {
6850 /* Not all characters are smaller than limit */
6851 raise_encode_exception(&exc, encoding, unicode,
6852 collstart, collend, reason);
6853 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006855 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6856 str = _PyBytesWriter_WriteBytes(&writer, str,
6857 PyUnicode_DATA(rep),
6858 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006860 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006861 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006862 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006863
6864 /* If overallocation was disabled, ensure that it was the last
6865 write. Otherwise, we missed an optimization */
6866 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006867 }
6868 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006869
Victor Stinner50149202015-09-22 00:26:54 +02006870 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006871 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006872 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006873
6874 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006875 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006876 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006877 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006878 Py_XDECREF(exc);
6879 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006880}
6881
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006882/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006883PyObject *
6884PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006885 Py_ssize_t size,
6886 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006889 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890 if (unicode == NULL)
6891 return NULL;
6892 result = unicode_encode_ucs1(unicode, errors, 256);
6893 Py_DECREF(unicode);
6894 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895}
6896
Alexander Belopolsky40018472011-02-26 01:02:56 +00006897PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006898_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899{
6900 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 PyErr_BadArgument();
6902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006904 if (PyUnicode_READY(unicode) == -1)
6905 return NULL;
6906 /* Fast path: if it is a one-byte string, construct
6907 bytes object directly. */
6908 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6909 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6910 PyUnicode_GET_LENGTH(unicode));
6911 /* Non-Latin-1 characters present. Defer to above function to
6912 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006913 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006914}
6915
6916PyObject*
6917PyUnicode_AsLatin1String(PyObject *unicode)
6918{
6919 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920}
6921
6922/* --- 7-bit ASCII Codec -------------------------------------------------- */
6923
Alexander Belopolsky40018472011-02-26 01:02:56 +00006924PyObject *
6925PyUnicode_DecodeASCII(const char *s,
6926 Py_ssize_t size,
6927 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006930 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006931 int kind;
6932 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006933 Py_ssize_t startinpos;
6934 Py_ssize_t endinpos;
6935 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006937 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006939 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006940
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006942 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006943
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006945 if (size == 1 && (unsigned char)s[0] < 128)
6946 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006947
Victor Stinner8f674cc2013-04-17 23:02:17 +02006948 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006949 writer.min_length = size;
6950 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006951 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006954 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006955 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006956 writer.pos = outpos;
6957 if (writer.pos == size)
6958 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006959
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006960 s += writer.pos;
6961 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006962 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006963 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006965 PyUnicode_WRITE(kind, data, writer.pos, c);
6966 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006968 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006970
6971 /* byte outsize range 0x00..0x7f: call the error handler */
6972
6973 if (error_handler == _Py_ERROR_UNKNOWN)
6974 error_handler = get_error_handler(errors);
6975
6976 switch (error_handler)
6977 {
6978 case _Py_ERROR_REPLACE:
6979 case _Py_ERROR_SURROGATEESCAPE:
6980 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006981 but we may switch to UCS2 at the first write */
6982 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6983 goto onError;
6984 kind = writer.kind;
6985 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006986
6987 if (error_handler == _Py_ERROR_REPLACE)
6988 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6989 else
6990 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6991 writer.pos++;
6992 ++s;
6993 break;
6994
6995 case _Py_ERROR_IGNORE:
6996 ++s;
6997 break;
6998
6999 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 startinpos = s-starts;
7001 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007002 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007003 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 "ascii", "ordinal not in range(128)",
7005 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007006 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 kind = writer.kind;
7009 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007012 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007014 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007015
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007017 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007018 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007019 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 return NULL;
7021}
7022
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007023/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007024PyObject *
7025PyUnicode_EncodeASCII(const Py_UNICODE *p,
7026 Py_ssize_t size,
7027 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007030 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007031 if (unicode == NULL)
7032 return NULL;
7033 result = unicode_encode_ucs1(unicode, errors, 128);
7034 Py_DECREF(unicode);
7035 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036}
7037
Alexander Belopolsky40018472011-02-26 01:02:56 +00007038PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007039_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040{
7041 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 PyErr_BadArgument();
7043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007045 if (PyUnicode_READY(unicode) == -1)
7046 return NULL;
7047 /* Fast path: if it is an ASCII-only string, construct bytes object
7048 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007049 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007050 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7051 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007052 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007053}
7054
7055PyObject *
7056PyUnicode_AsASCIIString(PyObject *unicode)
7057{
7058 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059}
7060
Steve Dowercc16be82016-09-08 10:35:16 -07007061#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007062
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007063/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007064
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007065#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066#define NEED_RETRY
7067#endif
7068
Victor Stinner3a50e702011-10-18 21:21:00 +02007069#ifndef WC_ERR_INVALID_CHARS
7070# define WC_ERR_INVALID_CHARS 0x0080
7071#endif
7072
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007073static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007074code_page_name(UINT code_page, PyObject **obj)
7075{
7076 *obj = NULL;
7077 if (code_page == CP_ACP)
7078 return "mbcs";
7079 if (code_page == CP_UTF7)
7080 return "CP_UTF7";
7081 if (code_page == CP_UTF8)
7082 return "CP_UTF8";
7083
7084 *obj = PyBytes_FromFormat("cp%u", code_page);
7085 if (*obj == NULL)
7086 return NULL;
7087 return PyBytes_AS_STRING(*obj);
7088}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090static DWORD
7091decode_code_page_flags(UINT code_page)
7092{
7093 if (code_page == CP_UTF7) {
7094 /* The CP_UTF7 decoder only supports flags=0 */
7095 return 0;
7096 }
7097 else
7098 return MB_ERR_INVALID_CHARS;
7099}
7100
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007101/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007102 * Decode a byte string from a Windows code page into unicode object in strict
7103 * mode.
7104 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007105 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7106 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007108static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007109decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007110 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 const char *in,
7112 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113{
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007115 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117
7118 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 assert(insize > 0);
7120 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7121 if (outsize <= 0)
7122 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123
7124 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007126 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007127 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 if (*v == NULL)
7129 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007130 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007131 }
7132 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007134 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007135 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138 }
7139
7140 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7142 if (outsize <= 0)
7143 goto error;
7144 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007145
Victor Stinner3a50e702011-10-18 21:21:00 +02007146error:
7147 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7148 return -2;
7149 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007150 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007151}
7152
Victor Stinner3a50e702011-10-18 21:21:00 +02007153/*
7154 * Decode a byte string from a code page into unicode object with an error
7155 * handler.
7156 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007157 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 * UnicodeDecodeError exception and returns -1 on error.
7159 */
7160static int
7161decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007162 PyObject **v,
7163 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007164 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007165{
7166 const char *startin = in;
7167 const char *endin = in + size;
7168 const DWORD flags = decode_code_page_flags(code_page);
7169 /* Ideally, we should get reason from FormatMessage. This is the Windows
7170 2000 English version of the message. */
7171 const char *reason = "No mapping for the Unicode character exists "
7172 "in the target code page.";
7173 /* each step cannot decode more than 1 character, but a character can be
7174 represented as a surrogate pair */
7175 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007176 int insize;
7177 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 PyObject *errorHandler = NULL;
7179 PyObject *exc = NULL;
7180 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007181 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 DWORD err;
7183 int ret = -1;
7184
7185 assert(size > 0);
7186
7187 encoding = code_page_name(code_page, &encoding_obj);
7188 if (encoding == NULL)
7189 return -1;
7190
Victor Stinner7d00cc12014-03-17 23:08:06 +01007191 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7193 UnicodeDecodeError. */
7194 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7195 if (exc != NULL) {
7196 PyCodec_StrictErrors(exc);
7197 Py_CLEAR(exc);
7198 }
7199 goto error;
7200 }
7201
7202 if (*v == NULL) {
7203 /* Create unicode object */
7204 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7205 PyErr_NoMemory();
7206 goto error;
7207 }
Victor Stinnerab595942011-12-17 04:59:06 +01007208 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007209 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 if (*v == NULL)
7211 goto error;
7212 startout = PyUnicode_AS_UNICODE(*v);
7213 }
7214 else {
7215 /* Extend unicode object */
7216 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7217 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7218 PyErr_NoMemory();
7219 goto error;
7220 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007221 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 goto error;
7223 startout = PyUnicode_AS_UNICODE(*v) + n;
7224 }
7225
7226 /* Decode the byte string character per character */
7227 out = startout;
7228 while (in < endin)
7229 {
7230 /* Decode a character */
7231 insize = 1;
7232 do
7233 {
7234 outsize = MultiByteToWideChar(code_page, flags,
7235 in, insize,
7236 buffer, Py_ARRAY_LENGTH(buffer));
7237 if (outsize > 0)
7238 break;
7239 err = GetLastError();
7240 if (err != ERROR_NO_UNICODE_TRANSLATION
7241 && err != ERROR_INSUFFICIENT_BUFFER)
7242 {
7243 PyErr_SetFromWindowsErr(0);
7244 goto error;
7245 }
7246 insize++;
7247 }
7248 /* 4=maximum length of a UTF-8 sequence */
7249 while (insize <= 4 && (in + insize) <= endin);
7250
7251 if (outsize <= 0) {
7252 Py_ssize_t startinpos, endinpos, outpos;
7253
Victor Stinner7d00cc12014-03-17 23:08:06 +01007254 /* last character in partial decode? */
7255 if (in + insize >= endin && !final)
7256 break;
7257
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 startinpos = in - startin;
7259 endinpos = startinpos + 1;
7260 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007261 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 errors, &errorHandler,
7263 encoding, reason,
7264 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007265 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007266 {
7267 goto error;
7268 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007269 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 }
7271 else {
7272 in += insize;
7273 memcpy(out, buffer, outsize * sizeof(wchar_t));
7274 out += outsize;
7275 }
7276 }
7277
7278 /* write a NUL character at the end */
7279 *out = 0;
7280
7281 /* Extend unicode object */
7282 outsize = out - startout;
7283 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007284 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007286 /* (in - startin) <= size and size is an int */
7287 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007288
7289error:
7290 Py_XDECREF(encoding_obj);
7291 Py_XDECREF(errorHandler);
7292 Py_XDECREF(exc);
7293 return ret;
7294}
7295
Victor Stinner3a50e702011-10-18 21:21:00 +02007296static PyObject *
7297decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007298 const char *s, Py_ssize_t size,
7299 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007300{
Victor Stinner76a31a62011-11-04 00:05:13 +01007301 PyObject *v = NULL;
7302 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007303
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 if (code_page < 0) {
7305 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7306 return NULL;
7307 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007308 if (size < 0) {
7309 PyErr_BadInternalCall();
7310 return NULL;
7311 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007312
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315
Victor Stinner76a31a62011-11-04 00:05:13 +01007316 do
7317 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007318#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 if (size > INT_MAX) {
7320 chunk_size = INT_MAX;
7321 final = 0;
7322 done = 0;
7323 }
7324 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 {
7327 chunk_size = (int)size;
7328 final = (consumed == NULL);
7329 done = 1;
7330 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007331
Victor Stinner76a31a62011-11-04 00:05:13 +01007332 if (chunk_size == 0 && done) {
7333 if (v != NULL)
7334 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007335 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 converted = decode_code_page_strict(code_page, &v,
7339 s, chunk_size);
7340 if (converted == -2)
7341 converted = decode_code_page_errors(code_page, &v,
7342 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007343 errors, final);
7344 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007345
7346 if (converted < 0) {
7347 Py_XDECREF(v);
7348 return NULL;
7349 }
7350
7351 if (consumed)
7352 *consumed += converted;
7353
7354 s += converted;
7355 size -= converted;
7356 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007357
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007358 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359}
7360
Alexander Belopolsky40018472011-02-26 01:02:56 +00007361PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007362PyUnicode_DecodeCodePageStateful(int code_page,
7363 const char *s,
7364 Py_ssize_t size,
7365 const char *errors,
7366 Py_ssize_t *consumed)
7367{
7368 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7369}
7370
7371PyObject *
7372PyUnicode_DecodeMBCSStateful(const char *s,
7373 Py_ssize_t size,
7374 const char *errors,
7375 Py_ssize_t *consumed)
7376{
7377 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7378}
7379
7380PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007381PyUnicode_DecodeMBCS(const char *s,
7382 Py_ssize_t size,
7383 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007384{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007385 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7386}
7387
Victor Stinner3a50e702011-10-18 21:21:00 +02007388static DWORD
7389encode_code_page_flags(UINT code_page, const char *errors)
7390{
7391 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007392 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 }
7394 else if (code_page == CP_UTF7) {
7395 /* CP_UTF7 only supports flags=0 */
7396 return 0;
7397 }
7398 else {
7399 if (errors != NULL && strcmp(errors, "replace") == 0)
7400 return 0;
7401 else
7402 return WC_NO_BEST_FIT_CHARS;
7403 }
7404}
7405
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007406/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 * Encode a Unicode string to a Windows code page into a byte string in strict
7408 * mode.
7409 *
7410 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007411 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007413static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007414encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007415 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417{
Victor Stinner554f3f02010-06-16 23:33:54 +00007418 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 BOOL *pusedDefaultChar = &usedDefaultChar;
7420 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007421 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007422 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 const DWORD flags = encode_code_page_flags(code_page, NULL);
7424 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007425 /* Create a substring so that we can get the UTF-16 representation
7426 of just the slice under consideration. */
7427 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007428
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007430
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007432 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007434 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007435
Victor Stinner2fc507f2011-11-04 20:06:39 +01007436 substring = PyUnicode_Substring(unicode, offset, offset+len);
7437 if (substring == NULL)
7438 return -1;
7439 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7440 if (p == NULL) {
7441 Py_DECREF(substring);
7442 return -1;
7443 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007444 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007445
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007446 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007448 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 NULL, 0,
7450 NULL, pusedDefaultChar);
7451 if (outsize <= 0)
7452 goto error;
7453 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007454 if (pusedDefaultChar && *pusedDefaultChar) {
7455 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007458
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 if (*outbytes == NULL) {
7463 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007467 }
7468 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 const Py_ssize_t n = PyBytes_Size(*outbytes);
7471 if (outsize > PY_SSIZE_T_MAX - n) {
7472 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7477 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007481 }
7482
7483 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007485 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 out, outsize,
7487 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007488 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 if (outsize <= 0)
7490 goto error;
7491 if (pusedDefaultChar && *pusedDefaultChar)
7492 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007493 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007496 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7498 return -2;
7499 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007500 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007501}
7502
Victor Stinner3a50e702011-10-18 21:21:00 +02007503/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007504 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 * error handler.
7506 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007507 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 * -1 on other error.
7509 */
7510static int
7511encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007512 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007514{
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007516 Py_ssize_t pos = unicode_offset;
7517 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 /* Ideally, we should get reason from FormatMessage. This is the Windows
7519 2000 English version of the message. */
7520 const char *reason = "invalid character";
7521 /* 4=maximum length of a UTF-8 sequence */
7522 char buffer[4];
7523 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7524 Py_ssize_t outsize;
7525 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 PyObject *errorHandler = NULL;
7527 PyObject *exc = NULL;
7528 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007529 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007530 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 PyObject *rep;
7532 int ret = -1;
7533
7534 assert(insize > 0);
7535
7536 encoding = code_page_name(code_page, &encoding_obj);
7537 if (encoding == NULL)
7538 return -1;
7539
7540 if (errors == NULL || strcmp(errors, "strict") == 0) {
7541 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7542 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007543 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 if (exc != NULL) {
7545 PyCodec_StrictErrors(exc);
7546 Py_DECREF(exc);
7547 }
7548 Py_XDECREF(encoding_obj);
7549 return -1;
7550 }
7551
7552 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7553 pusedDefaultChar = &usedDefaultChar;
7554 else
7555 pusedDefaultChar = NULL;
7556
7557 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7558 PyErr_NoMemory();
7559 goto error;
7560 }
7561 outsize = insize * Py_ARRAY_LENGTH(buffer);
7562
7563 if (*outbytes == NULL) {
7564 /* Create string object */
7565 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7566 if (*outbytes == NULL)
7567 goto error;
7568 out = PyBytes_AS_STRING(*outbytes);
7569 }
7570 else {
7571 /* Extend string object */
7572 Py_ssize_t n = PyBytes_Size(*outbytes);
7573 if (n > PY_SSIZE_T_MAX - outsize) {
7574 PyErr_NoMemory();
7575 goto error;
7576 }
7577 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7578 goto error;
7579 out = PyBytes_AS_STRING(*outbytes) + n;
7580 }
7581
7582 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007583 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007585 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7586 wchar_t chars[2];
7587 int charsize;
7588 if (ch < 0x10000) {
7589 chars[0] = (wchar_t)ch;
7590 charsize = 1;
7591 }
7592 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007593 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7594 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007595 charsize = 2;
7596 }
7597
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007599 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 buffer, Py_ARRAY_LENGTH(buffer),
7601 NULL, pusedDefaultChar);
7602 if (outsize > 0) {
7603 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7604 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007605 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 memcpy(out, buffer, outsize);
7607 out += outsize;
7608 continue;
7609 }
7610 }
7611 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7612 PyErr_SetFromWindowsErr(0);
7613 goto error;
7614 }
7615
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 rep = unicode_encode_call_errorhandler(
7617 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007618 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007619 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 if (rep == NULL)
7621 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007622 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007623
7624 if (PyBytes_Check(rep)) {
7625 outsize = PyBytes_GET_SIZE(rep);
7626 if (outsize != 1) {
7627 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7628 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7629 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7630 Py_DECREF(rep);
7631 goto error;
7632 }
7633 out = PyBytes_AS_STRING(*outbytes) + offset;
7634 }
7635 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7636 out += outsize;
7637 }
7638 else {
7639 Py_ssize_t i;
7640 enum PyUnicode_Kind kind;
7641 void *data;
7642
Benjamin Petersonbac79492012-01-14 13:34:47 -05007643 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 Py_DECREF(rep);
7645 goto error;
7646 }
7647
7648 outsize = PyUnicode_GET_LENGTH(rep);
7649 if (outsize != 1) {
7650 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7651 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7652 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7653 Py_DECREF(rep);
7654 goto error;
7655 }
7656 out = PyBytes_AS_STRING(*outbytes) + offset;
7657 }
7658 kind = PyUnicode_KIND(rep);
7659 data = PyUnicode_DATA(rep);
7660 for (i=0; i < outsize; i++) {
7661 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7662 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007663 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007664 encoding, unicode,
7665 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007666 "unable to encode error handler result to ASCII");
7667 Py_DECREF(rep);
7668 goto error;
7669 }
7670 *out = (unsigned char)ch;
7671 out++;
7672 }
7673 }
7674 Py_DECREF(rep);
7675 }
7676 /* write a NUL byte */
7677 *out = 0;
7678 outsize = out - PyBytes_AS_STRING(*outbytes);
7679 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7680 if (_PyBytes_Resize(outbytes, outsize) < 0)
7681 goto error;
7682 ret = 0;
7683
7684error:
7685 Py_XDECREF(encoding_obj);
7686 Py_XDECREF(errorHandler);
7687 Py_XDECREF(exc);
7688 return ret;
7689}
7690
Victor Stinner3a50e702011-10-18 21:21:00 +02007691static PyObject *
7692encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007693 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 const char *errors)
7695{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007696 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007697 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007698 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007699 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007700
Victor Stinner29dacf22015-01-26 16:41:32 +01007701 if (!PyUnicode_Check(unicode)) {
7702 PyErr_BadArgument();
7703 return NULL;
7704 }
7705
Benjamin Petersonbac79492012-01-14 13:34:47 -05007706 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007707 return NULL;
7708 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007709
Victor Stinner3a50e702011-10-18 21:21:00 +02007710 if (code_page < 0) {
7711 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7712 return NULL;
7713 }
7714
Martin v. Löwis3d325192011-11-04 18:23:06 +01007715 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007716 return PyBytes_FromStringAndSize(NULL, 0);
7717
Victor Stinner7581cef2011-11-03 22:32:33 +01007718 offset = 0;
7719 do
7720 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007721#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007722 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007723 chunks. */
7724 if (len > INT_MAX/2) {
7725 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 done = 0;
7727 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007728 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007730 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 done = 1;
7733 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007734
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007736 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007737 errors);
7738 if (ret == -2)
7739 ret = encode_code_page_errors(code_page, &outbytes,
7740 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007741 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007742 if (ret < 0) {
7743 Py_XDECREF(outbytes);
7744 return NULL;
7745 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007746
Victor Stinner7581cef2011-11-03 22:32:33 +01007747 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007748 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007749 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007750
Victor Stinner3a50e702011-10-18 21:21:00 +02007751 return outbytes;
7752}
7753
7754PyObject *
7755PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7756 Py_ssize_t size,
7757 const char *errors)
7758{
Victor Stinner7581cef2011-11-03 22:32:33 +01007759 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007760 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007761 if (unicode == NULL)
7762 return NULL;
7763 res = encode_code_page(CP_ACP, unicode, errors);
7764 Py_DECREF(unicode);
7765 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007766}
7767
7768PyObject *
7769PyUnicode_EncodeCodePage(int code_page,
7770 PyObject *unicode,
7771 const char *errors)
7772{
Victor Stinner7581cef2011-11-03 22:32:33 +01007773 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007774}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007775
Alexander Belopolsky40018472011-02-26 01:02:56 +00007776PyObject *
7777PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007778{
Victor Stinner7581cef2011-11-03 22:32:33 +01007779 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007780}
7781
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007782#undef NEED_RETRY
7783
Steve Dowercc16be82016-09-08 10:35:16 -07007784#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007785
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786/* --- Character Mapping Codec -------------------------------------------- */
7787
Victor Stinnerfb161b12013-04-18 01:44:27 +02007788static int
7789charmap_decode_string(const char *s,
7790 Py_ssize_t size,
7791 PyObject *mapping,
7792 const char *errors,
7793 _PyUnicodeWriter *writer)
7794{
7795 const char *starts = s;
7796 const char *e;
7797 Py_ssize_t startinpos, endinpos;
7798 PyObject *errorHandler = NULL, *exc = NULL;
7799 Py_ssize_t maplen;
7800 enum PyUnicode_Kind mapkind;
7801 void *mapdata;
7802 Py_UCS4 x;
7803 unsigned char ch;
7804
7805 if (PyUnicode_READY(mapping) == -1)
7806 return -1;
7807
7808 maplen = PyUnicode_GET_LENGTH(mapping);
7809 mapdata = PyUnicode_DATA(mapping);
7810 mapkind = PyUnicode_KIND(mapping);
7811
7812 e = s + size;
7813
7814 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7815 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7816 * is disabled in encoding aliases, latin1 is preferred because
7817 * its implementation is faster. */
7818 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7819 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7820 Py_UCS4 maxchar = writer->maxchar;
7821
7822 assert (writer->kind == PyUnicode_1BYTE_KIND);
7823 while (s < e) {
7824 ch = *s;
7825 x = mapdata_ucs1[ch];
7826 if (x > maxchar) {
7827 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7828 goto onError;
7829 maxchar = writer->maxchar;
7830 outdata = (Py_UCS1 *)writer->data;
7831 }
7832 outdata[writer->pos] = x;
7833 writer->pos++;
7834 ++s;
7835 }
7836 return 0;
7837 }
7838
7839 while (s < e) {
7840 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7841 enum PyUnicode_Kind outkind = writer->kind;
7842 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7843 if (outkind == PyUnicode_1BYTE_KIND) {
7844 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7845 Py_UCS4 maxchar = writer->maxchar;
7846 while (s < e) {
7847 ch = *s;
7848 x = mapdata_ucs2[ch];
7849 if (x > maxchar)
7850 goto Error;
7851 outdata[writer->pos] = x;
7852 writer->pos++;
7853 ++s;
7854 }
7855 break;
7856 }
7857 else if (outkind == PyUnicode_2BYTE_KIND) {
7858 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7859 while (s < e) {
7860 ch = *s;
7861 x = mapdata_ucs2[ch];
7862 if (x == 0xFFFE)
7863 goto Error;
7864 outdata[writer->pos] = x;
7865 writer->pos++;
7866 ++s;
7867 }
7868 break;
7869 }
7870 }
7871 ch = *s;
7872
7873 if (ch < maplen)
7874 x = PyUnicode_READ(mapkind, mapdata, ch);
7875 else
7876 x = 0xfffe; /* invalid value */
7877Error:
7878 if (x == 0xfffe)
7879 {
7880 /* undefined mapping */
7881 startinpos = s-starts;
7882 endinpos = startinpos+1;
7883 if (unicode_decode_call_errorhandler_writer(
7884 errors, &errorHandler,
7885 "charmap", "character maps to <undefined>",
7886 &starts, &e, &startinpos, &endinpos, &exc, &s,
7887 writer)) {
7888 goto onError;
7889 }
7890 continue;
7891 }
7892
7893 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7894 goto onError;
7895 ++s;
7896 }
7897 Py_XDECREF(errorHandler);
7898 Py_XDECREF(exc);
7899 return 0;
7900
7901onError:
7902 Py_XDECREF(errorHandler);
7903 Py_XDECREF(exc);
7904 return -1;
7905}
7906
7907static int
7908charmap_decode_mapping(const char *s,
7909 Py_ssize_t size,
7910 PyObject *mapping,
7911 const char *errors,
7912 _PyUnicodeWriter *writer)
7913{
7914 const char *starts = s;
7915 const char *e;
7916 Py_ssize_t startinpos, endinpos;
7917 PyObject *errorHandler = NULL, *exc = NULL;
7918 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007919 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007920
7921 e = s + size;
7922
7923 while (s < e) {
7924 ch = *s;
7925
7926 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7927 key = PyLong_FromLong((long)ch);
7928 if (key == NULL)
7929 goto onError;
7930
7931 item = PyObject_GetItem(mapping, key);
7932 Py_DECREF(key);
7933 if (item == NULL) {
7934 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7935 /* No mapping found means: mapping is undefined. */
7936 PyErr_Clear();
7937 goto Undefined;
7938 } else
7939 goto onError;
7940 }
7941
7942 /* Apply mapping */
7943 if (item == Py_None)
7944 goto Undefined;
7945 if (PyLong_Check(item)) {
7946 long value = PyLong_AS_LONG(item);
7947 if (value == 0xFFFE)
7948 goto Undefined;
7949 if (value < 0 || value > MAX_UNICODE) {
7950 PyErr_Format(PyExc_TypeError,
7951 "character mapping must be in range(0x%lx)",
7952 (unsigned long)MAX_UNICODE + 1);
7953 goto onError;
7954 }
7955
7956 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7957 goto onError;
7958 }
7959 else if (PyUnicode_Check(item)) {
7960 if (PyUnicode_READY(item) == -1)
7961 goto onError;
7962 if (PyUnicode_GET_LENGTH(item) == 1) {
7963 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7964 if (value == 0xFFFE)
7965 goto Undefined;
7966 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7967 goto onError;
7968 }
7969 else {
7970 writer->overallocate = 1;
7971 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7972 goto onError;
7973 }
7974 }
7975 else {
7976 /* wrong return value */
7977 PyErr_SetString(PyExc_TypeError,
7978 "character mapping must return integer, None or str");
7979 goto onError;
7980 }
7981 Py_CLEAR(item);
7982 ++s;
7983 continue;
7984
7985Undefined:
7986 /* undefined mapping */
7987 Py_CLEAR(item);
7988 startinpos = s-starts;
7989 endinpos = startinpos+1;
7990 if (unicode_decode_call_errorhandler_writer(
7991 errors, &errorHandler,
7992 "charmap", "character maps to <undefined>",
7993 &starts, &e, &startinpos, &endinpos, &exc, &s,
7994 writer)) {
7995 goto onError;
7996 }
7997 }
7998 Py_XDECREF(errorHandler);
7999 Py_XDECREF(exc);
8000 return 0;
8001
8002onError:
8003 Py_XDECREF(item);
8004 Py_XDECREF(errorHandler);
8005 Py_XDECREF(exc);
8006 return -1;
8007}
8008
Alexander Belopolsky40018472011-02-26 01:02:56 +00008009PyObject *
8010PyUnicode_DecodeCharmap(const char *s,
8011 Py_ssize_t size,
8012 PyObject *mapping,
8013 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008015 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008016
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 /* Default to Latin-1 */
8018 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008022 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008023 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008024 writer.min_length = size;
8025 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008027
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008028 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008029 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8030 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008031 }
8032 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008033 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8034 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008036 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008037
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008039 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 return NULL;
8041}
8042
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043/* Charmap encoding: the lookup table */
8044
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 PyObject_HEAD
8047 unsigned char level1[32];
8048 int count2, count3;
8049 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050};
8051
8052static PyObject*
8053encoding_map_size(PyObject *obj, PyObject* args)
8054{
8055 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058}
8059
8060static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 PyDoc_STR("Return the size (in bytes) of this object") },
8063 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064};
8065
8066static void
8067encoding_map_dealloc(PyObject* o)
8068{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008069 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070}
8071
8072static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008073 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 "EncodingMap", /*tp_name*/
8075 sizeof(struct encoding_map), /*tp_basicsize*/
8076 0, /*tp_itemsize*/
8077 /* methods */
8078 encoding_map_dealloc, /*tp_dealloc*/
8079 0, /*tp_print*/
8080 0, /*tp_getattr*/
8081 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008082 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 0, /*tp_repr*/
8084 0, /*tp_as_number*/
8085 0, /*tp_as_sequence*/
8086 0, /*tp_as_mapping*/
8087 0, /*tp_hash*/
8088 0, /*tp_call*/
8089 0, /*tp_str*/
8090 0, /*tp_getattro*/
8091 0, /*tp_setattro*/
8092 0, /*tp_as_buffer*/
8093 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8094 0, /*tp_doc*/
8095 0, /*tp_traverse*/
8096 0, /*tp_clear*/
8097 0, /*tp_richcompare*/
8098 0, /*tp_weaklistoffset*/
8099 0, /*tp_iter*/
8100 0, /*tp_iternext*/
8101 encoding_map_methods, /*tp_methods*/
8102 0, /*tp_members*/
8103 0, /*tp_getset*/
8104 0, /*tp_base*/
8105 0, /*tp_dict*/
8106 0, /*tp_descr_get*/
8107 0, /*tp_descr_set*/
8108 0, /*tp_dictoffset*/
8109 0, /*tp_init*/
8110 0, /*tp_alloc*/
8111 0, /*tp_new*/
8112 0, /*tp_free*/
8113 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114};
8115
8116PyObject*
8117PyUnicode_BuildEncodingMap(PyObject* string)
8118{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119 PyObject *result;
8120 struct encoding_map *mresult;
8121 int i;
8122 int need_dict = 0;
8123 unsigned char level1[32];
8124 unsigned char level2[512];
8125 unsigned char *mlevel1, *mlevel2, *mlevel3;
8126 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008127 int kind;
8128 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008129 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008132 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 PyErr_BadArgument();
8134 return NULL;
8135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008136 kind = PyUnicode_KIND(string);
8137 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008138 length = PyUnicode_GET_LENGTH(string);
8139 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008140 memset(level1, 0xFF, sizeof level1);
8141 memset(level2, 0xFF, sizeof level2);
8142
8143 /* If there isn't a one-to-one mapping of NULL to \0,
8144 or if there are non-BMP characters, we need to use
8145 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008148 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008150 ch = PyUnicode_READ(kind, data, i);
8151 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 need_dict = 1;
8153 break;
8154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 /* unmapped character */
8157 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 l1 = ch >> 11;
8159 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 if (level1[l1] == 0xFF)
8161 level1[l1] = count2++;
8162 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008163 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008164 }
8165
8166 if (count2 >= 0xFF || count3 >= 0xFF)
8167 need_dict = 1;
8168
8169 if (need_dict) {
8170 PyObject *result = PyDict_New();
8171 PyObject *key, *value;
8172 if (!result)
8173 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008174 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008176 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 if (!key || !value)
8178 goto failed1;
8179 if (PyDict_SetItem(result, key, value) == -1)
8180 goto failed1;
8181 Py_DECREF(key);
8182 Py_DECREF(value);
8183 }
8184 return result;
8185 failed1:
8186 Py_XDECREF(key);
8187 Py_XDECREF(value);
8188 Py_DECREF(result);
8189 return NULL;
8190 }
8191
8192 /* Create a three-level trie */
8193 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8194 16*count2 + 128*count3 - 1);
8195 if (!result)
8196 return PyErr_NoMemory();
8197 PyObject_Init(result, &EncodingMapType);
8198 mresult = (struct encoding_map*)result;
8199 mresult->count2 = count2;
8200 mresult->count3 = count3;
8201 mlevel1 = mresult->level1;
8202 mlevel2 = mresult->level23;
8203 mlevel3 = mresult->level23 + 16*count2;
8204 memcpy(mlevel1, level1, 32);
8205 memset(mlevel2, 0xFF, 16*count2);
8206 memset(mlevel3, 0, 128*count3);
8207 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008208 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008209 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008210 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8211 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008212 /* unmapped character */
8213 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008214 o1 = ch>>11;
8215 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008216 i2 = 16*mlevel1[o1] + o2;
8217 if (mlevel2[i2] == 0xFF)
8218 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008219 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220 i3 = 128*mlevel2[i2] + o3;
8221 mlevel3[i3] = i;
8222 }
8223 return result;
8224}
8225
8226static int
Victor Stinner22168992011-11-20 17:09:18 +01008227encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008228{
8229 struct encoding_map *map = (struct encoding_map*)mapping;
8230 int l1 = c>>11;
8231 int l2 = (c>>7) & 0xF;
8232 int l3 = c & 0x7F;
8233 int i;
8234
Victor Stinner22168992011-11-20 17:09:18 +01008235 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008237 if (c == 0)
8238 return 0;
8239 /* level 1*/
8240 i = map->level1[l1];
8241 if (i == 0xFF) {
8242 return -1;
8243 }
8244 /* level 2*/
8245 i = map->level23[16*i+l2];
8246 if (i == 0xFF) {
8247 return -1;
8248 }
8249 /* level 3 */
8250 i = map->level23[16*map->count2 + 128*i + l3];
8251 if (i == 0) {
8252 return -1;
8253 }
8254 return i;
8255}
8256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257/* Lookup the character ch in the mapping. If the character
8258 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008259 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008260static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008261charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262{
Christian Heimes217cfd12007-12-02 14:31:20 +00008263 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 PyObject *x;
8265
8266 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 x = PyObject_GetItem(mapping, w);
8269 Py_DECREF(w);
8270 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8272 /* No mapping found means: mapping is undefined. */
8273 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008274 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 } else
8276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008278 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008280 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 long value = PyLong_AS_LONG(x);
8282 if (value < 0 || value > 255) {
8283 PyErr_SetString(PyExc_TypeError,
8284 "character mapping must be in range(256)");
8285 Py_DECREF(x);
8286 return NULL;
8287 }
8288 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008290 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 /* wrong return value */
8294 PyErr_Format(PyExc_TypeError,
8295 "character mapping must return integer, bytes or None, not %.400s",
8296 x->ob_type->tp_name);
8297 Py_DECREF(x);
8298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 }
8300}
8301
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008302static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008303charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008304{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008305 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8306 /* exponentially overallocate to minimize reallocations */
8307 if (requiredsize < 2*outsize)
8308 requiredsize = 2*outsize;
8309 if (_PyBytes_Resize(outobj, requiredsize))
8310 return -1;
8311 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312}
8313
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred in the lookup or the resize */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008318 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319 space is available. Return a new reference to the object that
8320 was put in the output buffer, or Py_None, if the mapping was undefined
8321 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008322 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008323static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008324charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008327 PyObject *rep;
8328 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008329 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330
Christian Heimes90aa7642007-12-19 02:45:37 +00008331 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 if (res == -1)
8335 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 if (outsize<requiredsize)
8337 if (charmapencode_resize(outobj, outpos, requiredsize))
8338 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008339 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 outstart[(*outpos)++] = (char)res;
8341 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 }
8343
8344 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 Py_DECREF(rep);
8349 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 if (PyLong_Check(rep)) {
8352 Py_ssize_t requiredsize = *outpos+1;
8353 if (outsize<requiredsize)
8354 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8355 Py_DECREF(rep);
8356 return enc_EXCEPTION;
8357 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008358 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008360 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 else {
8362 const char *repchars = PyBytes_AS_STRING(rep);
8363 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8364 Py_ssize_t requiredsize = *outpos+repsize;
8365 if (outsize<requiredsize)
8366 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8367 Py_DECREF(rep);
8368 return enc_EXCEPTION;
8369 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008370 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 memcpy(outstart + *outpos, repchars, repsize);
8372 *outpos += repsize;
8373 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008375 Py_DECREF(rep);
8376 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377}
8378
8379/* handle an error in PyUnicode_EncodeCharmap
8380 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008381static int
8382charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008383 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008385 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008386 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387{
8388 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008390 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008391 enum PyUnicode_Kind kind;
8392 void *data;
8393 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008395 Py_ssize_t collstartpos = *inpos;
8396 Py_ssize_t collendpos = *inpos+1;
8397 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 char *encoding = "charmap";
8399 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008400 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008401 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008402 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403
Benjamin Petersonbac79492012-01-14 13:34:47 -05008404 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008405 return -1;
8406 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 /* find all unencodable characters */
8408 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008409 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008410 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008412 val = encoding_map_lookup(ch, mapping);
8413 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 break;
8415 ++collendpos;
8416 continue;
8417 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8420 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 if (rep==NULL)
8422 return -1;
8423 else if (rep!=Py_None) {
8424 Py_DECREF(rep);
8425 break;
8426 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008427 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 }
8430 /* cache callback name lookup
8431 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008432 if (*error_handler == _Py_ERROR_UNKNOWN)
8433 *error_handler = get_error_handler(errors);
8434
8435 switch (*error_handler) {
8436 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008437 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008439
8440 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 x = charmapencode_output('?', mapping, res, respos);
8443 if (x==enc_EXCEPTION) {
8444 return -1;
8445 }
8446 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008447 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return -1;
8449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008450 }
8451 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008452 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 *inpos = collendpos;
8454 break;
Victor Stinner50149202015-09-22 00:26:54 +02008455
8456 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 /* generate replacement (temporarily (mis)uses p) */
8458 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 char buffer[2+29+1+1];
8460 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008461 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 for (cp = buffer; *cp; ++cp) {
8463 x = charmapencode_output(*cp, mapping, res, respos);
8464 if (x==enc_EXCEPTION)
8465 return -1;
8466 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008467 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return -1;
8469 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008470 }
8471 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 *inpos = collendpos;
8473 break;
Victor Stinner50149202015-09-22 00:26:54 +02008474
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 default:
Victor Stinner50149202015-09-22 00:26:54 +02008476 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008477 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008481 if (PyBytes_Check(repunicode)) {
8482 /* Directly copy bytes result to output. */
8483 Py_ssize_t outsize = PyBytes_Size(*res);
8484 Py_ssize_t requiredsize;
8485 repsize = PyBytes_Size(repunicode);
8486 requiredsize = *respos + repsize;
8487 if (requiredsize > outsize)
8488 /* Make room for all additional bytes. */
8489 if (charmapencode_resize(res, respos, requiredsize)) {
8490 Py_DECREF(repunicode);
8491 return -1;
8492 }
8493 memcpy(PyBytes_AsString(*res) + *respos,
8494 PyBytes_AsString(repunicode), repsize);
8495 *respos += repsize;
8496 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008497 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008498 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008500 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008501 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008502 Py_DECREF(repunicode);
8503 return -1;
8504 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008505 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008506 data = PyUnicode_DATA(repunicode);
8507 kind = PyUnicode_KIND(repunicode);
8508 for (index = 0; index < repsize; index++) {
8509 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8510 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008512 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 return -1;
8514 }
8515 else if (x==enc_FAILED) {
8516 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008517 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return -1;
8519 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008520 }
8521 *inpos = newpos;
8522 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 }
8524 return 0;
8525}
8526
Alexander Belopolsky40018472011-02-26 01:02:56 +00008527PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008528_PyUnicode_EncodeCharmap(PyObject *unicode,
8529 PyObject *mapping,
8530 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 /* output object */
8533 PyObject *res = NULL;
8534 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008535 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008539 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008541 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008542 void *data;
8543 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544
Benjamin Petersonbac79492012-01-14 13:34:47 -05008545 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008546 return NULL;
8547 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008548 data = PyUnicode_DATA(unicode);
8549 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 /* Default to Latin-1 */
8552 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008553 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 /* allocate enough for a simple encoding without
8556 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008557 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 if (res == NULL)
8559 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008560 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008564 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008566 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 if (x==enc_EXCEPTION) /* error */
8568 goto onError;
8569 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008570 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008572 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 &res, &respos)) {
8574 goto onError;
8575 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008576 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 else
8578 /* done with this character => adjust input position */
8579 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008583 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008584 if (_PyBytes_Resize(&res, respos) < 0)
8585 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008588 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 return res;
8590
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 Py_XDECREF(res);
8593 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008594 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 return NULL;
8596}
8597
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008598/* Deprecated */
8599PyObject *
8600PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8601 Py_ssize_t size,
8602 PyObject *mapping,
8603 const char *errors)
8604{
8605 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008606 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008607 if (unicode == NULL)
8608 return NULL;
8609 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8610 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008611 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612}
8613
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614PyObject *
8615PyUnicode_AsCharmapString(PyObject *unicode,
8616 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617{
8618 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 PyErr_BadArgument();
8620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008622 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623}
8624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008626static void
8627make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629 Py_ssize_t startpos, Py_ssize_t endpos,
8630 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 *exceptionObject = _PyUnicodeTranslateError_Create(
8634 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
8636 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8638 goto onError;
8639 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8640 goto onError;
8641 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8642 goto onError;
8643 return;
8644 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008645 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
8647}
8648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649/* error handling callback helper:
8650 build arguments, call the callback and check the arguments,
8651 put the result into newpos and return the replacement string, which
8652 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008653static PyObject *
8654unicode_translate_call_errorhandler(const char *errors,
8655 PyObject **errorHandler,
8656 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658 Py_ssize_t startpos, Py_ssize_t endpos,
8659 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008661 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008663 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664 PyObject *restuple;
8665 PyObject *resunicode;
8666
8667 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 }
8672
8673 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008678 restuple = PyObject_CallFunctionObjArgs(
8679 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008683 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 Py_DECREF(restuple);
8685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008687 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 &resunicode, &i_newpos)) {
8689 Py_DECREF(restuple);
8690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008692 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 else
8695 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008697 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 Py_DECREF(restuple);
8699 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 Py_INCREF(resunicode);
8702 Py_DECREF(restuple);
8703 return resunicode;
8704}
8705
8706/* Lookup the character ch in the mapping and put the result in result,
8707 which must be decrefed by the caller.
8708 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008709static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711{
Christian Heimes217cfd12007-12-02 14:31:20 +00008712 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713 PyObject *x;
8714
8715 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 x = PyObject_GetItem(mapping, w);
8718 Py_DECREF(w);
8719 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8721 /* No mapping found means: use 1:1 mapping. */
8722 PyErr_Clear();
8723 *result = NULL;
8724 return 0;
8725 } else
8726 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 }
8728 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 *result = x;
8730 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008732 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008734 if (value < 0 || value > MAX_UNICODE) {
8735 PyErr_Format(PyExc_ValueError,
8736 "character mapping must be in range(0x%x)",
8737 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 Py_DECREF(x);
8739 return -1;
8740 }
8741 *result = x;
8742 return 0;
8743 }
8744 else if (PyUnicode_Check(x)) {
8745 *result = x;
8746 return 0;
8747 }
8748 else {
8749 /* wrong return value */
8750 PyErr_SetString(PyExc_TypeError,
8751 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008752 Py_DECREF(x);
8753 return -1;
8754 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755}
Victor Stinner1194ea02014-04-04 19:37:40 +02008756
8757/* lookup the character, write the result into the writer.
8758 Return 1 if the result was written into the writer, return 0 if the mapping
8759 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008760static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008761charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8762 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763{
Victor Stinner1194ea02014-04-04 19:37:40 +02008764 PyObject *item;
8765
8766 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008768
8769 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008771 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008776
8777 if (item == Py_None) {
8778 Py_DECREF(item);
8779 return 0;
8780 }
8781
8782 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008783 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8784 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8785 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008786 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8787 Py_DECREF(item);
8788 return -1;
8789 }
8790 Py_DECREF(item);
8791 return 1;
8792 }
8793
8794 if (!PyUnicode_Check(item)) {
8795 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008797 }
8798
8799 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8800 Py_DECREF(item);
8801 return -1;
8802 }
8803
8804 Py_DECREF(item);
8805 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008806}
8807
Victor Stinner89a76ab2014-04-05 11:44:04 +02008808static int
8809unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8810 Py_UCS1 *translate)
8811{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008812 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813 int ret = 0;
8814
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815 if (charmaptranslate_lookup(ch, mapping, &item)) {
8816 return -1;
8817 }
8818
8819 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008820 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008821 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008822 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008823 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008824 /* not found => default to 1:1 mapping */
8825 translate[ch] = ch;
8826 return 1;
8827 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008828 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008829 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008830 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8831 used it */
8832 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008833 /* invalid character or character outside ASCII:
8834 skip the fast translate */
8835 goto exit;
8836 }
8837 translate[ch] = (Py_UCS1)replace;
8838 }
8839 else if (PyUnicode_Check(item)) {
8840 Py_UCS4 replace;
8841
8842 if (PyUnicode_READY(item) == -1) {
8843 Py_DECREF(item);
8844 return -1;
8845 }
8846 if (PyUnicode_GET_LENGTH(item) != 1)
8847 goto exit;
8848
8849 replace = PyUnicode_READ_CHAR(item, 0);
8850 if (replace > 127)
8851 goto exit;
8852 translate[ch] = (Py_UCS1)replace;
8853 }
8854 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008855 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 goto exit;
8857 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 ret = 1;
8859
Benjamin Peterson1365de72014-04-07 20:15:41 -04008860 exit:
8861 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 return ret;
8863}
8864
8865/* Fast path for ascii => ascii translation. Return 1 if the whole string
8866 was translated into writer, return 0 if the input string was partially
8867 translated into writer, raise an exception and return -1 on error. */
8868static int
8869unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008870 _PyUnicodeWriter *writer, int ignore,
8871 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872{
Victor Stinner872b2912014-04-05 14:27:07 +02008873 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874 Py_ssize_t len;
8875 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008876 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878 len = PyUnicode_GET_LENGTH(input);
8879
Victor Stinner872b2912014-04-05 14:27:07 +02008880 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008881
8882 in = PyUnicode_1BYTE_DATA(input);
8883 end = in + len;
8884
8885 assert(PyUnicode_IS_ASCII(writer->buffer));
8886 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8887 out = PyUnicode_1BYTE_DATA(writer->buffer);
8888
Victor Stinner872b2912014-04-05 14:27:07 +02008889 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008890 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008891 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008893 int translate = unicode_fast_translate_lookup(mapping, ch,
8894 ascii_table);
8895 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008896 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008897 if (translate == 0)
8898 goto exit;
8899 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900 }
Victor Stinner872b2912014-04-05 14:27:07 +02008901 if (ch2 == 0xfe) {
8902 if (ignore)
8903 continue;
8904 goto exit;
8905 }
8906 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008907 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008908 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008909 }
Victor Stinner872b2912014-04-05 14:27:07 +02008910 res = 1;
8911
8912exit:
8913 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008914 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008915 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008916}
8917
Victor Stinner3222da22015-10-01 22:07:32 +02008918static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919_PyUnicode_TranslateCharmap(PyObject *input,
8920 PyObject *mapping,
8921 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008924 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 Py_ssize_t size, i;
8926 int kind;
8927 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008928 _PyUnicodeWriter writer;
8929 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008930 char *reason = "character maps to <undefined>";
8931 PyObject *errorHandler = NULL;
8932 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008935
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 PyErr_BadArgument();
8938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 if (PyUnicode_READY(input) == -1)
8942 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008943 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 kind = PyUnicode_KIND(input);
8945 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008947 if (size == 0)
8948 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008950 /* allocate enough for a simple 1:1 translation without
8951 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008952 _PyUnicodeWriter_Init(&writer);
8953 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955
Victor Stinner872b2912014-04-05 14:27:07 +02008956 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8957
Victor Stinner33798672016-03-01 21:59:58 +01008958 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008959 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008960 if (PyUnicode_IS_ASCII(input)) {
8961 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8962 if (res < 0) {
8963 _PyUnicodeWriter_Dealloc(&writer);
8964 return NULL;
8965 }
8966 if (res == 1)
8967 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008968 }
Victor Stinner33798672016-03-01 21:59:58 +01008969 else {
8970 i = 0;
8971 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008975 int translate;
8976 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8977 Py_ssize_t newpos;
8978 /* startpos for collecting untranslatable chars */
8979 Py_ssize_t collstart;
8980 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982
Victor Stinner1194ea02014-04-04 19:37:40 +02008983 ch = PyUnicode_READ(kind, data, i);
8984 translate = charmaptranslate_output(ch, mapping, &writer);
8985 if (translate < 0)
8986 goto onError;
8987
8988 if (translate != 0) {
8989 /* it worked => adjust input pointer */
8990 ++i;
8991 continue;
8992 }
8993
8994 /* untranslatable character */
8995 collstart = i;
8996 collend = i+1;
8997
8998 /* find all untranslatable characters */
8999 while (collend < size) {
9000 PyObject *x;
9001 ch = PyUnicode_READ(kind, data, collend);
9002 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009003 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009004 Py_XDECREF(x);
9005 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009007 ++collend;
9008 }
9009
9010 if (ignore) {
9011 i = collend;
9012 }
9013 else {
9014 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9015 reason, input, &exc,
9016 collstart, collend, &newpos);
9017 if (repunicode == NULL)
9018 goto onError;
9019 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009021 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009022 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009023 Py_DECREF(repunicode);
9024 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009025 }
9026 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009027 Py_XDECREF(exc);
9028 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009032 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033 Py_XDECREF(exc);
9034 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035 return NULL;
9036}
9037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038/* Deprecated. Use PyUnicode_Translate instead. */
9039PyObject *
9040PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9041 Py_ssize_t size,
9042 PyObject *mapping,
9043 const char *errors)
9044{
Christian Heimes5f520f42012-09-11 14:03:25 +02009045 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009046 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 if (!unicode)
9048 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009049 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9050 Py_DECREF(unicode);
9051 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052}
9053
Alexander Belopolsky40018472011-02-26 01:02:56 +00009054PyObject *
9055PyUnicode_Translate(PyObject *str,
9056 PyObject *mapping,
9057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009059 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009060 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009061 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062}
Tim Petersced69f82003-09-16 20:30:58 +00009063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009065fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066{
9067 /* No need to call PyUnicode_READY(self) because this function is only
9068 called as a callback from fixup() which does it already. */
9069 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9070 const int kind = PyUnicode_KIND(self);
9071 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009072 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009073 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 Py_ssize_t i;
9075
9076 for (i = 0; i < len; ++i) {
9077 ch = PyUnicode_READ(kind, data, i);
9078 fixed = 0;
9079 if (ch > 127) {
9080 if (Py_UNICODE_ISSPACE(ch))
9081 fixed = ' ';
9082 else {
9083 const int decimal = Py_UNICODE_TODECIMAL(ch);
9084 if (decimal >= 0)
9085 fixed = '0' + decimal;
9086 }
9087 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009088 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009089 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 PyUnicode_WRITE(kind, data, i, fixed);
9091 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009092 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009093 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 }
9096
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009097 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098}
9099
9100PyObject *
9101_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9102{
9103 if (!PyUnicode_Check(unicode)) {
9104 PyErr_BadInternalCall();
9105 return NULL;
9106 }
9107 if (PyUnicode_READY(unicode) == -1)
9108 return NULL;
9109 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9110 /* If the string is already ASCII, just return the same string */
9111 Py_INCREF(unicode);
9112 return unicode;
9113 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009114 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115}
9116
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009117PyObject *
9118PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9119 Py_ssize_t length)
9120{
Victor Stinnerf0124502011-11-21 23:12:56 +01009121 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009122 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009123 Py_UCS4 maxchar;
9124 enum PyUnicode_Kind kind;
9125 void *data;
9126
Victor Stinner99d7ad02012-02-22 13:37:39 +01009127 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009128 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009129 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009130 if (ch > 127) {
9131 int decimal = Py_UNICODE_TODECIMAL(ch);
9132 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009133 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009134 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009135 }
9136 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009137
9138 /* Copy to a new string */
9139 decimal = PyUnicode_New(length, maxchar);
9140 if (decimal == NULL)
9141 return decimal;
9142 kind = PyUnicode_KIND(decimal);
9143 data = PyUnicode_DATA(decimal);
9144 /* Iterate over code points */
9145 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009146 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009147 if (ch > 127) {
9148 int decimal = Py_UNICODE_TODECIMAL(ch);
9149 if (decimal >= 0)
9150 ch = '0' + decimal;
9151 }
9152 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009154 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009155}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009156/* --- Decimal Encoder ---------------------------------------------------- */
9157
Alexander Belopolsky40018472011-02-26 01:02:56 +00009158int
9159PyUnicode_EncodeDecimal(Py_UNICODE *s,
9160 Py_ssize_t length,
9161 char *output,
9162 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009163{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009164 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009165 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009166 enum PyUnicode_Kind kind;
9167 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009168
9169 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009170 PyErr_BadArgument();
9171 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009172 }
9173
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009174 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009175 if (unicode == NULL)
9176 return -1;
9177
Victor Stinner42bf7752011-11-21 22:52:58 +01009178 kind = PyUnicode_KIND(unicode);
9179 data = PyUnicode_DATA(unicode);
9180
Victor Stinnerb84d7232011-11-22 01:50:07 +01009181 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009182 PyObject *exc;
9183 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009185 Py_ssize_t startpos;
9186
9187 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009188
Benjamin Peterson29060642009-01-31 22:14:21 +00009189 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009190 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009191 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009193 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 decimal = Py_UNICODE_TODECIMAL(ch);
9195 if (decimal >= 0) {
9196 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009197 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009198 continue;
9199 }
9200 if (0 < ch && ch < 256) {
9201 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009202 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 continue;
9204 }
Victor Stinner6345be92011-11-25 20:09:01 +01009205
Victor Stinner42bf7752011-11-21 22:52:58 +01009206 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009207 exc = NULL;
9208 raise_encode_exception(&exc, "decimal", unicode,
9209 startpos, startpos+1,
9210 "invalid decimal Unicode string");
9211 Py_XDECREF(exc);
9212 Py_DECREF(unicode);
9213 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009214 }
9215 /* 0-terminate the output string */
9216 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009217 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009218 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009219}
9220
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221/* --- Helpers ------------------------------------------------------------ */
9222
/* Helper macro to fix up start/end slice values against a sequence length.

   `start` and `end` must be modifiable lvalues; they are updated in place:
     - `end` greater than `len` becomes `len`;
     - negative `end` is counted from the end and floored at 0;
     - negative `start` is counted from the end and floored at 0.
   A `start` greater than `len` is left untouched.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and stays correct inside an unbraced if/else.  Invoke it
   with a trailing semicolon.  Arguments are evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009239any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009241 Py_ssize_t end,
9242 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009244 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 void *buf1, *buf2;
9246 Py_ssize_t len1, len2, result;
9247
9248 kind1 = PyUnicode_KIND(s1);
9249 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009250 if (kind1 < kind2)
9251 return -1;
9252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253 len1 = PyUnicode_GET_LENGTH(s1);
9254 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009255 ADJUST_INDICES(start, end, len1);
9256 if (end - start < len2)
9257 return -1;
9258
9259 buf1 = PyUnicode_DATA(s1);
9260 buf2 = PyUnicode_DATA(s2);
9261 if (len2 == 1) {
9262 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9263 result = findchar((const char *)buf1 + kind1*start,
9264 kind1, end - start, ch, direction);
9265 if (result == -1)
9266 return -1;
9267 else
9268 return start + result;
9269 }
9270
9271 if (kind2 != kind1) {
9272 buf2 = _PyUnicode_AsKind(s2, kind1);
9273 if (!buf2)
9274 return -2;
9275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276
Victor Stinner794d5672011-10-10 03:21:36 +02009277 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009278 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009279 case PyUnicode_1BYTE_KIND:
9280 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9281 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9282 else
9283 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9284 break;
9285 case PyUnicode_2BYTE_KIND:
9286 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9287 break;
9288 case PyUnicode_4BYTE_KIND:
9289 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9290 break;
9291 default:
9292 assert(0); result = -2;
9293 }
9294 }
9295 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009296 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009297 case PyUnicode_1BYTE_KIND:
9298 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9299 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9300 else
9301 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9302 break;
9303 case PyUnicode_2BYTE_KIND:
9304 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9305 break;
9306 case PyUnicode_4BYTE_KIND:
9307 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9308 break;
9309 default:
9310 assert(0); result = -2;
9311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 }
9313
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009314 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 PyMem_Free(buf2);
9316
9317 return result;
9318}
9319
9320Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009321_PyUnicode_InsertThousandsGrouping(
9322 PyObject *unicode, Py_ssize_t index,
9323 Py_ssize_t n_buffer,
9324 void *digits, Py_ssize_t n_digits,
9325 Py_ssize_t min_width,
9326 const char *grouping, PyObject *thousands_sep,
9327 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328{
Victor Stinner41a863c2012-02-24 00:37:51 +01009329 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009330 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009331 Py_ssize_t thousands_sep_len;
9332 Py_ssize_t len;
9333
9334 if (unicode != NULL) {
9335 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009336 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009337 }
9338 else {
9339 kind = PyUnicode_1BYTE_KIND;
9340 data = NULL;
9341 }
9342 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9343 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9344 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9345 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009346 if (thousands_sep_kind < kind) {
9347 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9348 if (!thousands_sep_data)
9349 return -1;
9350 }
9351 else {
9352 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9353 if (!data)
9354 return -1;
9355 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009356 }
9357
Benjamin Petersonead6b532011-12-20 17:23:42 -06009358 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009360 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009361 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009362 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009363 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009364 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009365 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009366 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009367 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009368 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009369 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009370 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009373 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009375 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009378 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009379 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009380 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009381 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 break;
9383 default:
9384 assert(0);
9385 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009387 if (unicode != NULL && thousands_sep_kind != kind) {
9388 if (thousands_sep_kind < kind)
9389 PyMem_Free(thousands_sep_data);
9390 else
9391 PyMem_Free(data);
9392 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009393 if (unicode == NULL) {
9394 *maxchar = 127;
9395 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009396 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009397 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 }
9399 }
9400 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401}
9402
9403
Alexander Belopolsky40018472011-02-26 01:02:56 +00009404Py_ssize_t
9405PyUnicode_Count(PyObject *str,
9406 PyObject *substr,
9407 Py_ssize_t start,
9408 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009410 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 void *buf1 = NULL, *buf2 = NULL;
9413 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009414
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009417
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009418 kind1 = PyUnicode_KIND(str);
9419 kind2 = PyUnicode_KIND(substr);
9420 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009421 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009423 len1 = PyUnicode_GET_LENGTH(str);
9424 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009426 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009429 buf1 = PyUnicode_DATA(str);
9430 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009431 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009432 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 if (!buf2)
9434 goto onError;
9435 }
9436
9437 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009439 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009440 result = asciilib_count(
9441 ((Py_UCS1*)buf1) + start, end - start,
9442 buf2, len2, PY_SSIZE_T_MAX
9443 );
9444 else
9445 result = ucs1lib_count(
9446 ((Py_UCS1*)buf1) + start, end - start,
9447 buf2, len2, PY_SSIZE_T_MAX
9448 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 break;
9450 case PyUnicode_2BYTE_KIND:
9451 result = ucs2lib_count(
9452 ((Py_UCS2*)buf1) + start, end - start,
9453 buf2, len2, PY_SSIZE_T_MAX
9454 );
9455 break;
9456 case PyUnicode_4BYTE_KIND:
9457 result = ucs4lib_count(
9458 ((Py_UCS4*)buf1) + start, end - start,
9459 buf2, len2, PY_SSIZE_T_MAX
9460 );
9461 break;
9462 default:
9463 assert(0); result = 0;
9464 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009465
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009466 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 PyMem_Free(buf2);
9468
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009471 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 PyMem_Free(buf2);
9473 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474}
9475
Alexander Belopolsky40018472011-02-26 01:02:56 +00009476Py_ssize_t
9477PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009478 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009479 Py_ssize_t start,
9480 Py_ssize_t end,
9481 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009483 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009485
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009486 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487}
9488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489Py_ssize_t
9490PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9491 Py_ssize_t start, Py_ssize_t end,
9492 int direction)
9493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009495 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 if (PyUnicode_READY(str) == -1)
9497 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009498 len = PyUnicode_GET_LENGTH(str);
9499 ADJUST_INDICES(start, end, len);
9500 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009501 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009503 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9504 kind, end-start, ch, direction);
9505 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009507 else
9508 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509}
9510
Alexander Belopolsky40018472011-02-26 01:02:56 +00009511static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009512tailmatch(PyObject *self,
9513 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009514 Py_ssize_t start,
9515 Py_ssize_t end,
9516 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 int kind_self;
9519 int kind_sub;
9520 void *data_self;
9521 void *data_sub;
9522 Py_ssize_t offset;
9523 Py_ssize_t i;
9524 Py_ssize_t end_sub;
9525
9526 if (PyUnicode_READY(self) == -1 ||
9527 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009528 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9531 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009535 if (PyUnicode_GET_LENGTH(substring) == 0)
9536 return 1;
9537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 kind_self = PyUnicode_KIND(self);
9539 data_self = PyUnicode_DATA(self);
9540 kind_sub = PyUnicode_KIND(substring);
9541 data_sub = PyUnicode_DATA(substring);
9542 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9543
9544 if (direction > 0)
9545 offset = end;
9546 else
9547 offset = start;
9548
9549 if (PyUnicode_READ(kind_self, data_self, offset) ==
9550 PyUnicode_READ(kind_sub, data_sub, 0) &&
9551 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9552 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9553 /* If both are of the same kind, memcmp is sufficient */
9554 if (kind_self == kind_sub) {
9555 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009556 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 data_sub,
9558 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009559 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009561 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 else {
9563 /* We do not need to compare 0 and len(substring)-1 because
9564 the if statement above ensured already that they are equal
9565 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 for (i = 1; i < end_sub; ++i) {
9567 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9568 PyUnicode_READ(kind_sub, data_sub, i))
9569 return 0;
9570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573 }
9574
9575 return 0;
9576}
9577
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578Py_ssize_t
9579PyUnicode_Tailmatch(PyObject *str,
9580 PyObject *substr,
9581 Py_ssize_t start,
9582 Py_ssize_t end,
9583 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009585 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009587
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589}
9590
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591/* Apply fixfct filter to the Unicode object self and return a
9592 reference to the modified object */
9593
Alexander Belopolsky40018472011-02-26 01:02:56 +00009594static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009595fixup(PyObject *self,
9596 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 PyObject *u;
9599 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009600 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009602 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009605 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 /* fix functions return the new maximum character in a string,
9608 if the kind of the resulting unicode object does not change,
9609 everything is fine. Otherwise we need to change the string kind
9610 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009611 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009612
9613 if (maxchar_new == 0) {
9614 /* no changes */;
9615 if (PyUnicode_CheckExact(self)) {
9616 Py_DECREF(u);
9617 Py_INCREF(self);
9618 return self;
9619 }
9620 else
9621 return u;
9622 }
9623
Victor Stinnere6abb482012-05-02 01:15:40 +02009624 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625
Victor Stinnereaab6042011-12-11 22:22:39 +01009626 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009628
9629 /* In case the maximum character changed, we need to
9630 convert the string to the new category. */
9631 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9632 if (v == NULL) {
9633 Py_DECREF(u);
9634 return NULL;
9635 }
9636 if (maxchar_new > maxchar_old) {
9637 /* If the maxchar increased so that the kind changed, not all
9638 characters are representable anymore and we need to fix the
9639 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009640 _PyUnicode_FastCopyCharacters(v, 0,
9641 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009642 maxchar_old = fixfct(v);
9643 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 }
9645 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009646 _PyUnicode_FastCopyCharacters(v, 0,
9647 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009649 Py_DECREF(u);
9650 assert(_PyUnicode_CheckConsistency(v, 1));
9651 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652}
9653
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654static PyObject *
9655ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9658 char *resdata, *data = PyUnicode_DATA(self);
9659 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009660
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 res = PyUnicode_New(len, 127);
9662 if (res == NULL)
9663 return NULL;
9664 resdata = PyUnicode_DATA(res);
9665 if (lower)
9666 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 _Py_bytes_upper(resdata, data, len);
9669 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670}
9671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675 Py_ssize_t j;
9676 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009677 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009679
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9681
9682 where ! is a negation and \p{xxx} is a character with property xxx.
9683 */
9684 for (j = i - 1; j >= 0; j--) {
9685 c = PyUnicode_READ(kind, data, j);
9686 if (!_PyUnicode_IsCaseIgnorable(c))
9687 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009689 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9690 if (final_sigma) {
9691 for (j = i + 1; j < length; j++) {
9692 c = PyUnicode_READ(kind, data, j);
9693 if (!_PyUnicode_IsCaseIgnorable(c))
9694 break;
9695 }
9696 final_sigma = j == length || !_PyUnicode_IsCased(c);
9697 }
9698 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699}
9700
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701static int
9702lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9703 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 /* Obscure special case. */
9706 if (c == 0x3A3) {
9707 mapped[0] = handle_capital_sigma(kind, data, length, i);
9708 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711}
9712
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713static Py_ssize_t
9714do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 Py_ssize_t i, k = 0;
9717 int n_res, j;
9718 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009719
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 c = PyUnicode_READ(kind, data, 0);
9721 n_res = _PyUnicode_ToUpperFull(c, mapped);
9722 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009723 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009724 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 for (i = 1; i < length; i++) {
9727 c = PyUnicode_READ(kind, data, i);
9728 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9729 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009730 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009732 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009733 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735}
9736
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737static Py_ssize_t
9738do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9739 Py_ssize_t i, k = 0;
9740
9741 for (i = 0; i < length; i++) {
9742 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9743 int n_res, j;
9744 if (Py_UNICODE_ISUPPER(c)) {
9745 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9746 }
9747 else if (Py_UNICODE_ISLOWER(c)) {
9748 n_res = _PyUnicode_ToUpperFull(c, mapped);
9749 }
9750 else {
9751 n_res = 1;
9752 mapped[0] = c;
9753 }
9754 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009755 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 res[k++] = mapped[j];
9757 }
9758 }
9759 return k;
9760}
9761
9762static Py_ssize_t
9763do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9764 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009766 Py_ssize_t i, k = 0;
9767
9768 for (i = 0; i < length; i++) {
9769 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9770 int n_res, j;
9771 if (lower)
9772 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9773 else
9774 n_res = _PyUnicode_ToUpperFull(c, mapped);
9775 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009776 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009777 res[k++] = mapped[j];
9778 }
9779 }
9780 return k;
9781}
9782
9783static Py_ssize_t
9784do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9785{
9786 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9787}
9788
9789static Py_ssize_t
9790do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791{
9792 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9793}
9794
Benjamin Petersone51757f2012-01-12 21:10:29 -05009795static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009796do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 Py_ssize_t i, k = 0;
9799
9800 for (i = 0; i < length; i++) {
9801 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9802 Py_UCS4 mapped[3];
9803 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9804 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009805 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009806 res[k++] = mapped[j];
9807 }
9808 }
9809 return k;
9810}
9811
9812static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009813do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9814{
9815 Py_ssize_t i, k = 0;
9816 int previous_is_cased;
9817
9818 previous_is_cased = 0;
9819 for (i = 0; i < length; i++) {
9820 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9821 Py_UCS4 mapped[3];
9822 int n_res, j;
9823
9824 if (previous_is_cased)
9825 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9826 else
9827 n_res = _PyUnicode_ToTitleFull(c, mapped);
9828
9829 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009830 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009831 res[k++] = mapped[j];
9832 }
9833
9834 previous_is_cased = _PyUnicode_IsCased(c);
9835 }
9836 return k;
9837}
9838
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839static PyObject *
9840case_operation(PyObject *self,
9841 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9842{
9843 PyObject *res = NULL;
9844 Py_ssize_t length, newlength = 0;
9845 int kind, outkind;
9846 void *data, *outdata;
9847 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9848
Benjamin Petersoneea48462012-01-16 14:28:50 -05009849 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009850
9851 kind = PyUnicode_KIND(self);
9852 data = PyUnicode_DATA(self);
9853 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009854 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009855 PyErr_SetString(PyExc_OverflowError, "string is too long");
9856 return NULL;
9857 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009858 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009859 if (tmp == NULL)
9860 return PyErr_NoMemory();
9861 newlength = perform(kind, data, length, tmp, &maxchar);
9862 res = PyUnicode_New(newlength, maxchar);
9863 if (res == NULL)
9864 goto leave;
9865 tmpend = tmp + newlength;
9866 outdata = PyUnicode_DATA(res);
9867 outkind = PyUnicode_KIND(res);
9868 switch (outkind) {
9869 case PyUnicode_1BYTE_KIND:
9870 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9871 break;
9872 case PyUnicode_2BYTE_KIND:
9873 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9874 break;
9875 case PyUnicode_4BYTE_KIND:
9876 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9877 break;
9878 default:
9879 assert(0);
9880 break;
9881 }
9882 leave:
9883 PyMem_FREE(tmp);
9884 return res;
9885}
9886
Tim Peters8ce9f162004-08-27 01:49:32 +00009887PyObject *
9888PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009890 PyObject *res;
9891 PyObject *fseq;
9892 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009893 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009895 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009896 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009897 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009898 }
9899
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009900 /* NOTE: the following code can't call back into Python code,
9901 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009902 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009903
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009904 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009905 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009906 res = _PyUnicode_JoinArray(separator, items, seqlen);
9907 Py_DECREF(fseq);
9908 return res;
9909}
9910
9911PyObject *
9912_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9913{
9914 PyObject *res = NULL; /* the result */
9915 PyObject *sep = NULL;
9916 Py_ssize_t seplen;
9917 PyObject *item;
9918 Py_ssize_t sz, i, res_offset;
9919 Py_UCS4 maxchar;
9920 Py_UCS4 item_maxchar;
9921 int use_memcpy;
9922 unsigned char *res_data = NULL, *sep_data = NULL;
9923 PyObject *last_obj;
9924 unsigned int kind = 0;
9925
Tim Peters05eba1f2004-08-27 21:32:02 +00009926 /* If empty sequence, return u"". */
9927 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009928 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009929 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009930
Tim Peters05eba1f2004-08-27 21:32:02 +00009931 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009932 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009933 if (seqlen == 1) {
9934 if (PyUnicode_CheckExact(items[0])) {
9935 res = items[0];
9936 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009937 return res;
9938 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009939 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009940 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009941 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009943 /* Set up sep and seplen */
9944 if (separator == NULL) {
9945 /* fall back to a blank space separator */
9946 sep = PyUnicode_FromOrdinal(' ');
9947 if (!sep)
9948 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009949 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009950 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009951 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009952 else {
9953 if (!PyUnicode_Check(separator)) {
9954 PyErr_Format(PyExc_TypeError,
9955 "separator: expected str instance,"
9956 " %.80s found",
9957 Py_TYPE(separator)->tp_name);
9958 goto onError;
9959 }
9960 if (PyUnicode_READY(separator))
9961 goto onError;
9962 sep = separator;
9963 seplen = PyUnicode_GET_LENGTH(separator);
9964 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9965 /* inc refcount to keep this code path symmetric with the
9966 above case of a blank separator */
9967 Py_INCREF(sep);
9968 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009969 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009970 }
9971
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009972 /* There are at least two things to join, or else we have a subclass
9973 * of str in the sequence.
9974 * Do a pre-pass to figure out the total amount of space we'll
9975 * need (sz), and see whether all argument are strings.
9976 */
9977 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009978#ifdef Py_DEBUG
9979 use_memcpy = 0;
9980#else
9981 use_memcpy = 1;
9982#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009983 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009984 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009985 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 if (!PyUnicode_Check(item)) {
9987 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009988 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 " %.80s found",
9990 i, Py_TYPE(item)->tp_name);
9991 goto onError;
9992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 if (PyUnicode_READY(item) == -1)
9994 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009995 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009997 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009998 if (i != 0) {
9999 add_sz += seplen;
10000 }
10001 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010002 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010004 goto onError;
10005 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010006 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010007 if (use_memcpy && last_obj != NULL) {
10008 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10009 use_memcpy = 0;
10010 }
10011 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010012 }
Tim Petersced69f82003-09-16 20:30:58 +000010013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010015 if (res == NULL)
10016 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010017
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010018 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010019#ifdef Py_DEBUG
10020 use_memcpy = 0;
10021#else
10022 if (use_memcpy) {
10023 res_data = PyUnicode_1BYTE_DATA(res);
10024 kind = PyUnicode_KIND(res);
10025 if (seplen != 0)
10026 sep_data = PyUnicode_1BYTE_DATA(sep);
10027 }
10028#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010029 if (use_memcpy) {
10030 for (i = 0; i < seqlen; ++i) {
10031 Py_ssize_t itemlen;
10032 item = items[i];
10033
10034 /* Copy item, and maybe the separator. */
10035 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010036 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010037 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010038 kind * seplen);
10039 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010040 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010041
10042 itemlen = PyUnicode_GET_LENGTH(item);
10043 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010044 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010045 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010046 kind * itemlen);
10047 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010048 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010049 }
10050 assert(res_data == PyUnicode_1BYTE_DATA(res)
10051 + kind * PyUnicode_GET_LENGTH(res));
10052 }
10053 else {
10054 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10055 Py_ssize_t itemlen;
10056 item = items[i];
10057
10058 /* Copy item, and maybe the separator. */
10059 if (i && seplen != 0) {
10060 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10061 res_offset += seplen;
10062 }
10063
10064 itemlen = PyUnicode_GET_LENGTH(item);
10065 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010066 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010067 res_offset += itemlen;
10068 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010069 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010070 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010071 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010074 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076
Benjamin Peterson29060642009-01-31 22:14:21 +000010077 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010079 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080 return NULL;
10081}
10082
/* Store `value` into `length` consecutive characters of the buffer `data`,
   starting at character index `start`.  `kind` selects the character width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   PyUnicode_WCHAR_KIND is not supported.

   This is a statement macro: arguments may be evaluated more than once, so
   they must be free of side effects.  Every argument is parenthesized in the
   expansion so that compound expressions can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10106
Victor Stinnerd3f08822012-05-29 12:57:52 +020010107void
10108_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10109 Py_UCS4 fill_char)
10110{
10111 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10112 const void *data = PyUnicode_DATA(unicode);
10113 assert(PyUnicode_IS_READY(unicode));
10114 assert(unicode_modifiable(unicode));
10115 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10116 assert(start >= 0);
10117 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10118 FILL(kind, data, fill_char, start, length);
10119}
10120
Victor Stinner3fe55312012-01-04 00:33:50 +010010121Py_ssize_t
10122PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10123 Py_UCS4 fill_char)
10124{
10125 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010126
10127 if (!PyUnicode_Check(unicode)) {
10128 PyErr_BadInternalCall();
10129 return -1;
10130 }
10131 if (PyUnicode_READY(unicode) == -1)
10132 return -1;
10133 if (unicode_check_modifiable(unicode))
10134 return -1;
10135
Victor Stinnerd3f08822012-05-29 12:57:52 +020010136 if (start < 0) {
10137 PyErr_SetString(PyExc_IndexError, "string index out of range");
10138 return -1;
10139 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010140 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10141 PyErr_SetString(PyExc_ValueError,
10142 "fill character is bigger than "
10143 "the string maximum character");
10144 return -1;
10145 }
10146
10147 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10148 length = Py_MIN(maxlen, length);
10149 if (length <= 0)
10150 return 0;
10151
Victor Stinnerd3f08822012-05-29 12:57:52 +020010152 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010153 return length;
10154}
10155
Victor Stinner9310abb2011-10-05 00:59:23 +020010156static PyObject *
10157pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010158 Py_ssize_t left,
10159 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 PyObject *u;
10163 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010164 int kind;
10165 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166
10167 if (left < 0)
10168 left = 0;
10169 if (right < 0)
10170 right = 0;
10171
Victor Stinnerc4b49542011-12-11 22:44:26 +010010172 if (left == 0 && right == 0)
10173 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10176 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010177 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10178 return NULL;
10179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010181 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010183 if (!u)
10184 return NULL;
10185
10186 kind = PyUnicode_KIND(u);
10187 data = PyUnicode_DATA(u);
10188 if (left)
10189 FILL(kind, data, fill, 0, left);
10190 if (right)
10191 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010192 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010193 assert(_PyUnicode_CheckConsistency(u, 1));
10194 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195}
10196
Alexander Belopolsky40018472011-02-26 01:02:56 +000010197PyObject *
10198PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010202 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Benjamin Petersonead6b532011-12-20 17:23:42 -060010205 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010207 if (PyUnicode_IS_ASCII(string))
10208 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010209 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 PyUnicode_GET_LENGTH(string), keepends);
10211 else
10212 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010213 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 break;
10216 case PyUnicode_2BYTE_KIND:
10217 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010218 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 PyUnicode_GET_LENGTH(string), keepends);
10220 break;
10221 case PyUnicode_4BYTE_KIND:
10222 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010223 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 PyUnicode_GET_LENGTH(string), keepends);
10225 break;
10226 default:
10227 assert(0);
10228 list = 0;
10229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231}
10232
Alexander Belopolsky40018472011-02-26 01:02:56 +000010233static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010234split(PyObject *self,
10235 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010236 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010238 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 void *buf1, *buf2;
10240 Py_ssize_t len1, len2;
10241 PyObject* out;
10242
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010244 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 if (PyUnicode_READY(self) == -1)
10247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010250 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010252 if (PyUnicode_IS_ASCII(self))
10253 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010254 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010255 PyUnicode_GET_LENGTH(self), maxcount
10256 );
10257 else
10258 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010259 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010260 PyUnicode_GET_LENGTH(self), maxcount
10261 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 case PyUnicode_2BYTE_KIND:
10263 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010264 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 PyUnicode_GET_LENGTH(self), maxcount
10266 );
10267 case PyUnicode_4BYTE_KIND:
10268 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010269 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 PyUnicode_GET_LENGTH(self), maxcount
10271 );
10272 default:
10273 assert(0);
10274 return NULL;
10275 }
10276
10277 if (PyUnicode_READY(substring) == -1)
10278 return NULL;
10279
10280 kind1 = PyUnicode_KIND(self);
10281 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 len1 = PyUnicode_GET_LENGTH(self);
10283 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010284 if (kind1 < kind2 || len1 < len2) {
10285 out = PyList_New(1);
10286 if (out == NULL)
10287 return NULL;
10288 Py_INCREF(self);
10289 PyList_SET_ITEM(out, 0, self);
10290 return out;
10291 }
10292 buf1 = PyUnicode_DATA(self);
10293 buf2 = PyUnicode_DATA(substring);
10294 if (kind2 != kind1) {
10295 buf2 = _PyUnicode_AsKind(substring, kind1);
10296 if (!buf2)
10297 return NULL;
10298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010300 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010302 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10303 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010305 else
10306 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010307 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 break;
10309 case PyUnicode_2BYTE_KIND:
10310 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010311 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 break;
10313 case PyUnicode_4BYTE_KIND:
10314 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010315 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 break;
10317 default:
10318 out = NULL;
10319 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010320 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 PyMem_Free(buf2);
10322 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323}
10324
Alexander Belopolsky40018472011-02-26 01:02:56 +000010325static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010326rsplit(PyObject *self,
10327 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010328 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010329{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010330 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 void *buf1, *buf2;
10332 Py_ssize_t len1, len2;
10333 PyObject* out;
10334
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010335 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010336 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (PyUnicode_READY(self) == -1)
10339 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010342 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344 if (PyUnicode_IS_ASCII(self))
10345 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010346 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010347 PyUnicode_GET_LENGTH(self), maxcount
10348 );
10349 else
10350 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010351 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010352 PyUnicode_GET_LENGTH(self), maxcount
10353 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 case PyUnicode_2BYTE_KIND:
10355 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010356 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 PyUnicode_GET_LENGTH(self), maxcount
10358 );
10359 case PyUnicode_4BYTE_KIND:
10360 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010361 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 PyUnicode_GET_LENGTH(self), maxcount
10363 );
10364 default:
10365 assert(0);
10366 return NULL;
10367 }
10368
10369 if (PyUnicode_READY(substring) == -1)
10370 return NULL;
10371
10372 kind1 = PyUnicode_KIND(self);
10373 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 len1 = PyUnicode_GET_LENGTH(self);
10375 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010376 if (kind1 < kind2 || len1 < len2) {
10377 out = PyList_New(1);
10378 if (out == NULL)
10379 return NULL;
10380 Py_INCREF(self);
10381 PyList_SET_ITEM(out, 0, self);
10382 return out;
10383 }
10384 buf1 = PyUnicode_DATA(self);
10385 buf2 = PyUnicode_DATA(substring);
10386 if (kind2 != kind1) {
10387 buf2 = _PyUnicode_AsKind(substring, kind1);
10388 if (!buf2)
10389 return NULL;
10390 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010392 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010394 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10395 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010396 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010397 else
10398 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 break;
10401 case PyUnicode_2BYTE_KIND:
10402 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010403 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 break;
10405 case PyUnicode_4BYTE_KIND:
10406 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010407 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 break;
10409 default:
10410 out = NULL;
10411 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010412 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 PyMem_Free(buf2);
10414 return out;
10415}
10416
10417static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010418anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10419 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010421 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010423 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10424 return asciilib_find(buf1, len1, buf2, len2, offset);
10425 else
10426 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 case PyUnicode_2BYTE_KIND:
10428 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10429 case PyUnicode_4BYTE_KIND:
10430 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10431 }
10432 assert(0);
10433 return -1;
10434}
10435
10436static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010437anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10438 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010440 switch (kind) {
10441 case PyUnicode_1BYTE_KIND:
10442 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10443 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10444 else
10445 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10446 case PyUnicode_2BYTE_KIND:
10447 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10448 case PyUnicode_4BYTE_KIND:
10449 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10450 }
10451 assert(0);
10452 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010453}
10454
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010455static void
10456replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10457 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10458{
10459 int kind = PyUnicode_KIND(u);
10460 void *data = PyUnicode_DATA(u);
10461 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10462 if (kind == PyUnicode_1BYTE_KIND) {
10463 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10464 (Py_UCS1 *)data + len,
10465 u1, u2, maxcount);
10466 }
10467 else if (kind == PyUnicode_2BYTE_KIND) {
10468 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10469 (Py_UCS2 *)data + len,
10470 u1, u2, maxcount);
10471 }
10472 else {
10473 assert(kind == PyUnicode_4BYTE_KIND);
10474 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10475 (Py_UCS4 *)data + len,
10476 u1, u2, maxcount);
10477 }
10478}
10479
Alexander Belopolsky40018472011-02-26 01:02:56 +000010480static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481replace(PyObject *self, PyObject *str1,
10482 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 PyObject *u;
10485 char *sbuf = PyUnicode_DATA(self);
10486 char *buf1 = PyUnicode_DATA(str1);
10487 char *buf2 = PyUnicode_DATA(str2);
10488 int srelease = 0, release1 = 0, release2 = 0;
10489 int skind = PyUnicode_KIND(self);
10490 int kind1 = PyUnicode_KIND(str1);
10491 int kind2 = PyUnicode_KIND(str2);
10492 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10493 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10494 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010495 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010496 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497
10498 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010501 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
Victor Stinner59de0ee2011-10-07 10:01:28 +020010503 if (str1 == str2)
10504 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505
Victor Stinner49a0a212011-10-12 23:46:10 +020010506 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010507 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10508 if (maxchar < maxchar_str1)
10509 /* substring too wide to be present */
10510 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010511 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10512 /* Replacing str1 with str2 may cause a maxchar reduction in the
10513 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010514 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010515 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010518 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010520 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010522 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010523 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010524 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010525
Victor Stinner69ed0f42013-04-09 21:48:24 +020010526 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010527 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010528 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010529 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010530 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010532 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010534
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010535 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10536 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010537 }
10538 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 int rkind = skind;
10540 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010541 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (kind1 < rkind) {
10544 /* widen substring */
10545 buf1 = _PyUnicode_AsKind(str1, rkind);
10546 if (!buf1) goto error;
10547 release1 = 1;
10548 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010549 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010550 if (i < 0)
10551 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 if (rkind > kind2) {
10553 /* widen replacement */
10554 buf2 = _PyUnicode_AsKind(str2, rkind);
10555 if (!buf2) goto error;
10556 release2 = 1;
10557 }
10558 else if (rkind < kind2) {
10559 /* widen self and buf1 */
10560 rkind = kind2;
10561 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010562 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 sbuf = _PyUnicode_AsKind(self, rkind);
10564 if (!sbuf) goto error;
10565 srelease = 1;
10566 buf1 = _PyUnicode_AsKind(str1, rkind);
10567 if (!buf1) goto error;
10568 release1 = 1;
10569 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010570 u = PyUnicode_New(slen, maxchar);
10571 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010573 assert(PyUnicode_KIND(u) == rkind);
10574 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010575
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010576 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010577 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010578 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010580 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010582
10583 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010584 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010585 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010586 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010587 if (i == -1)
10588 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010589 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010591 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010595 }
10596 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010598 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 int rkind = skind;
10600 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010603 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 buf1 = _PyUnicode_AsKind(str1, rkind);
10605 if (!buf1) goto error;
10606 release1 = 1;
10607 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010608 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 if (n == 0)
10610 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 buf2 = _PyUnicode_AsKind(str2, rkind);
10614 if (!buf2) goto error;
10615 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010618 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 rkind = kind2;
10620 sbuf = _PyUnicode_AsKind(self, rkind);
10621 if (!sbuf) goto error;
10622 srelease = 1;
10623 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010624 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 buf1 = _PyUnicode_AsKind(str1, rkind);
10626 if (!buf1) goto error;
10627 release1 = 1;
10628 }
10629 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10630 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010631 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 PyErr_SetString(PyExc_OverflowError,
10633 "replace string is too long");
10634 goto error;
10635 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010636 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010637 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010638 _Py_INCREF_UNICODE_EMPTY();
10639 if (!unicode_empty)
10640 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010641 u = unicode_empty;
10642 goto done;
10643 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010644 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 PyErr_SetString(PyExc_OverflowError,
10646 "replace string is too long");
10647 goto error;
10648 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010649 u = PyUnicode_New(new_size, maxchar);
10650 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010652 assert(PyUnicode_KIND(u) == rkind);
10653 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 ires = i = 0;
10655 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 while (n-- > 0) {
10657 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010658 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010660 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010661 if (j == -1)
10662 break;
10663 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010665 memcpy(res + rkind * ires,
10666 sbuf + rkind * i,
10667 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669 }
10670 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010672 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010674 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010681 memcpy(res + rkind * ires,
10682 sbuf + rkind * i,
10683 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010684 }
10685 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 /* interleave */
10687 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010688 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010690 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010692 if (--n <= 0)
10693 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010694 memcpy(res + rkind * ires,
10695 sbuf + rkind * i,
10696 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 ires++;
10698 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010700 memcpy(res + rkind * ires,
10701 sbuf + rkind * i,
10702 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010703 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010704 }
10705
10706 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010707 unicode_adjust_maxchar(&u);
10708 if (u == NULL)
10709 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010711
10712 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 if (srelease)
10714 PyMem_FREE(sbuf);
10715 if (release1)
10716 PyMem_FREE(buf1);
10717 if (release2)
10718 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010719 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010721
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010723 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 if (srelease)
10725 PyMem_FREE(sbuf);
10726 if (release1)
10727 PyMem_FREE(buf1);
10728 if (release2)
10729 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010730 return unicode_result_unchanged(self);
10731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 error:
10733 if (srelease && sbuf)
10734 PyMem_FREE(sbuf);
10735 if (release1 && buf1)
10736 PyMem_FREE(buf1);
10737 if (release2 && buf2)
10738 PyMem_FREE(buf2);
10739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740}
10741
10742/* --- Unicode Object Methods --------------------------------------------- */
10743
INADA Naoki3ae20562017-01-16 20:41:20 +090010744/*[clinic input]
10745str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746
INADA Naoki3ae20562017-01-16 20:41:20 +090010747Return a version of the string where each word is titlecased.
10748
10749More specifically, words start with uppercased characters and all remaining
10750cased characters have lower case.
10751[clinic start generated code]*/
10752
10753static PyObject *
10754unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010755/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010757 if (PyUnicode_READY(self) == -1)
10758 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010759 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760}
10761
INADA Naoki3ae20562017-01-16 20:41:20 +090010762/*[clinic input]
10763str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764
INADA Naoki3ae20562017-01-16 20:41:20 +090010765Return a capitalized version of the string.
10766
10767More specifically, make the first character have upper case and the rest lower
10768case.
10769[clinic start generated code]*/
10770
10771static PyObject *
10772unicode_capitalize_impl(PyObject *self)
10773/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010775 if (PyUnicode_READY(self) == -1)
10776 return NULL;
10777 if (PyUnicode_GET_LENGTH(self) == 0)
10778 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010779 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780}
10781
INADA Naoki3ae20562017-01-16 20:41:20 +090010782/*[clinic input]
10783str.casefold as unicode_casefold
10784
10785Return a version of the string suitable for caseless comparisons.
10786[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010787
10788static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010789unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010790/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010791{
10792 if (PyUnicode_READY(self) == -1)
10793 return NULL;
10794 if (PyUnicode_IS_ASCII(self))
10795 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010796 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010797}
10798
10799
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010800/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010801
10802static int
10803convert_uc(PyObject *obj, void *addr)
10804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010806
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010807 if (!PyUnicode_Check(obj)) {
10808 PyErr_Format(PyExc_TypeError,
10809 "The fill character must be a unicode character, "
10810 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010811 return 0;
10812 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010813 if (PyUnicode_READY(obj) < 0)
10814 return 0;
10815 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010816 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010817 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010818 return 0;
10819 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010820 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010821 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010822}
10823
INADA Naoki3ae20562017-01-16 20:41:20 +090010824/*[clinic input]
10825str.center as unicode_center
10826
10827 width: Py_ssize_t
10828 fillchar: Py_UCS4 = ' '
10829 /
10830
10831Return a centered string of length width.
10832
10833Padding is done using the specified fill character (default is a space).
10834[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
10836static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010837unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10838/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010840 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
Benjamin Petersonbac79492012-01-14 13:34:47 -050010842 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 return NULL;
10844
Victor Stinnerc4b49542011-12-11 22:44:26 +010010845 if (PyUnicode_GET_LENGTH(self) >= width)
10846 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847
Victor Stinnerc4b49542011-12-11 22:44:26 +010010848 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849 left = marg / 2 + (marg & width & 1);
10850
Victor Stinner9310abb2011-10-05 00:59:23 +020010851 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852}
10853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854/* This function assumes that str1 and str2 are readied by the caller. */
10855
Marc-André Lemburge5034372000-08-08 08:04:29 +000010856static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010857unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010858{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010859#define COMPARE(TYPE1, TYPE2) \
10860 do { \
10861 TYPE1* p1 = (TYPE1 *)data1; \
10862 TYPE2* p2 = (TYPE2 *)data2; \
10863 TYPE1* end = p1 + len; \
10864 Py_UCS4 c1, c2; \
10865 for (; p1 != end; p1++, p2++) { \
10866 c1 = *p1; \
10867 c2 = *p2; \
10868 if (c1 != c2) \
10869 return (c1 < c2) ? -1 : 1; \
10870 } \
10871 } \
10872 while (0)
10873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 int kind1, kind2;
10875 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010876 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 kind1 = PyUnicode_KIND(str1);
10879 kind2 = PyUnicode_KIND(str2);
10880 data1 = PyUnicode_DATA(str1);
10881 data2 = PyUnicode_DATA(str2);
10882 len1 = PyUnicode_GET_LENGTH(str1);
10883 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010884 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010885
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010886 switch(kind1) {
10887 case PyUnicode_1BYTE_KIND:
10888 {
10889 switch(kind2) {
10890 case PyUnicode_1BYTE_KIND:
10891 {
10892 int cmp = memcmp(data1, data2, len);
10893 /* normalize result of memcmp() into the range [-1; 1] */
10894 if (cmp < 0)
10895 return -1;
10896 if (cmp > 0)
10897 return 1;
10898 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010899 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010900 case PyUnicode_2BYTE_KIND:
10901 COMPARE(Py_UCS1, Py_UCS2);
10902 break;
10903 case PyUnicode_4BYTE_KIND:
10904 COMPARE(Py_UCS1, Py_UCS4);
10905 break;
10906 default:
10907 assert(0);
10908 }
10909 break;
10910 }
10911 case PyUnicode_2BYTE_KIND:
10912 {
10913 switch(kind2) {
10914 case PyUnicode_1BYTE_KIND:
10915 COMPARE(Py_UCS2, Py_UCS1);
10916 break;
10917 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010918 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010919 COMPARE(Py_UCS2, Py_UCS2);
10920 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010921 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010922 case PyUnicode_4BYTE_KIND:
10923 COMPARE(Py_UCS2, Py_UCS4);
10924 break;
10925 default:
10926 assert(0);
10927 }
10928 break;
10929 }
10930 case PyUnicode_4BYTE_KIND:
10931 {
10932 switch(kind2) {
10933 case PyUnicode_1BYTE_KIND:
10934 COMPARE(Py_UCS4, Py_UCS1);
10935 break;
10936 case PyUnicode_2BYTE_KIND:
10937 COMPARE(Py_UCS4, Py_UCS2);
10938 break;
10939 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010940 {
10941#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10942 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10943 /* normalize result of wmemcmp() into the range [-1; 1] */
10944 if (cmp < 0)
10945 return -1;
10946 if (cmp > 0)
10947 return 1;
10948#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010949 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010950#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010951 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010952 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010953 default:
10954 assert(0);
10955 }
10956 break;
10957 }
10958 default:
10959 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010960 }
10961
Victor Stinner770e19e2012-10-04 22:59:45 +020010962 if (len1 == len2)
10963 return 0;
10964 if (len1 < len2)
10965 return -1;
10966 else
10967 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010968
10969#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010970}
10971
Benjamin Peterson621b4302016-09-09 13:54:34 -070010972static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010973unicode_compare_eq(PyObject *str1, PyObject *str2)
10974{
10975 int kind;
10976 void *data1, *data2;
10977 Py_ssize_t len;
10978 int cmp;
10979
Victor Stinnere5567ad2012-10-23 02:48:49 +020010980 len = PyUnicode_GET_LENGTH(str1);
10981 if (PyUnicode_GET_LENGTH(str2) != len)
10982 return 0;
10983 kind = PyUnicode_KIND(str1);
10984 if (PyUnicode_KIND(str2) != kind)
10985 return 0;
10986 data1 = PyUnicode_DATA(str1);
10987 data2 = PyUnicode_DATA(str2);
10988
10989 cmp = memcmp(data1, data2, len * kind);
10990 return (cmp == 0);
10991}
10992
10993
Alexander Belopolsky40018472011-02-26 01:02:56 +000010994int
10995PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10998 if (PyUnicode_READY(left) == -1 ||
10999 PyUnicode_READY(right) == -1)
11000 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011001
11002 /* a string is equal to itself */
11003 if (left == right)
11004 return 0;
11005
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011006 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011008 PyErr_Format(PyExc_TypeError,
11009 "Can't compare %.100s and %.100s",
11010 left->ob_type->tp_name,
11011 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 return -1;
11013}
11014
Martin v. Löwis5b222132007-06-10 09:51:05 +000011015int
11016PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 Py_ssize_t i;
11019 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011021 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022
Victor Stinner910337b2011-10-03 03:20:16 +020011023 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011024 if (!PyUnicode_IS_READY(uni)) {
11025 const wchar_t *ws = _PyUnicode_WSTR(uni);
11026 /* Compare Unicode string and source character set string */
11027 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11028 if (chr != ustr[i])
11029 return (chr < ustr[i]) ? -1 : 1;
11030 }
11031 /* This check keeps Python strings that end in '\0' from comparing equal
11032 to C strings identical up to that point. */
11033 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11034 return 1; /* uni is longer */
11035 if (ustr[i])
11036 return -1; /* str is longer */
11037 return 0;
11038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011040 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011041 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011042 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011043 size_t len, len2 = strlen(str);
11044 int cmp;
11045
11046 len = Py_MIN(len1, len2);
11047 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011048 if (cmp != 0) {
11049 if (cmp < 0)
11050 return -1;
11051 else
11052 return 1;
11053 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011054 if (len1 > len2)
11055 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011056 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011057 return -1; /* str is longer */
11058 return 0;
11059 }
11060 else {
11061 void *data = PyUnicode_DATA(uni);
11062 /* Compare Unicode string and source character set string */
11063 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011064 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011065 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11066 /* This check keeps Python strings that end in '\0' from comparing equal
11067 to C strings identical up to that point. */
11068 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11069 return 1; /* uni is longer */
11070 if (str[i])
11071 return -1; /* str is longer */
11072 return 0;
11073 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011074}
11075
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011076static int
11077non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11078{
11079 size_t i, len;
11080 const wchar_t *p;
11081 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11082 if (strlen(str) != len)
11083 return 0;
11084 p = _PyUnicode_WSTR(unicode);
11085 assert(p);
11086 for (i = 0; i < len; i++) {
11087 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011088 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011089 return 0;
11090 }
11091 return 1;
11092}
11093
11094int
11095_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11096{
11097 size_t len;
11098 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011099 assert(str);
11100#ifndef NDEBUG
11101 for (const char *p = str; *p; p++) {
11102 assert((unsigned char)*p < 128);
11103 }
11104#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011105 if (PyUnicode_READY(unicode) == -1) {
11106 /* Memory error or bad data */
11107 PyErr_Clear();
11108 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11109 }
11110 if (!PyUnicode_IS_ASCII(unicode))
11111 return 0;
11112 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11113 return strlen(str) == len &&
11114 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11115}
11116
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011117int
11118_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11119{
11120 PyObject *right_uni;
11121 Py_hash_t hash;
11122
11123 assert(_PyUnicode_CHECK(left));
11124 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011125#ifndef NDEBUG
11126 for (const char *p = right->string; *p; p++) {
11127 assert((unsigned char)*p < 128);
11128 }
11129#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011130
11131 if (PyUnicode_READY(left) == -1) {
11132 /* memory error or bad data */
11133 PyErr_Clear();
11134 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11135 }
11136
11137 if (!PyUnicode_IS_ASCII(left))
11138 return 0;
11139
11140 right_uni = _PyUnicode_FromId(right); /* borrowed */
11141 if (right_uni == NULL) {
11142 /* memory error or bad data */
11143 PyErr_Clear();
11144 return _PyUnicode_EqualToASCIIString(left, right->string);
11145 }
11146
11147 if (left == right_uni)
11148 return 1;
11149
11150 if (PyUnicode_CHECK_INTERNED(left))
11151 return 0;
11152
11153 assert(_PyUnicode_HASH(right_uni) != 1);
11154 hash = _PyUnicode_HASH(left);
11155 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11156 return 0;
11157
11158 return unicode_compare_eq(left, right_uni);
11159}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011160
Benjamin Peterson29060642009-01-31 22:14:21 +000011161#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011162 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011163
Alexander Belopolsky40018472011-02-26 01:02:56 +000011164PyObject *
11165PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011166{
11167 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011168 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011169
Victor Stinnere5567ad2012-10-23 02:48:49 +020011170 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11171 Py_RETURN_NOTIMPLEMENTED;
11172
11173 if (PyUnicode_READY(left) == -1 ||
11174 PyUnicode_READY(right) == -1)
11175 return NULL;
11176
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011177 if (left == right) {
11178 switch (op) {
11179 case Py_EQ:
11180 case Py_LE:
11181 case Py_GE:
11182 /* a string is equal to itself */
11183 v = Py_True;
11184 break;
11185 case Py_NE:
11186 case Py_LT:
11187 case Py_GT:
11188 v = Py_False;
11189 break;
11190 default:
11191 PyErr_BadArgument();
11192 return NULL;
11193 }
11194 }
11195 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011196 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011197 result ^= (op == Py_NE);
11198 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011199 }
11200 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011201 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011202
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011203 /* Convert the return value to a Boolean */
11204 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011205 case Py_LE:
11206 v = TEST_COND(result <= 0);
11207 break;
11208 case Py_GE:
11209 v = TEST_COND(result >= 0);
11210 break;
11211 case Py_LT:
11212 v = TEST_COND(result == -1);
11213 break;
11214 case Py_GT:
11215 v = TEST_COND(result == 1);
11216 break;
11217 default:
11218 PyErr_BadArgument();
11219 return NULL;
11220 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011221 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011222 Py_INCREF(v);
11223 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011224}
11225
Alexander Belopolsky40018472011-02-26 01:02:56 +000011226int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011227_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11228{
11229 return unicode_eq(aa, bb);
11230}
11231
11232int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011233PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011234{
Victor Stinner77282cb2013-04-14 19:22:47 +020011235 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 void *buf1, *buf2;
11237 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011238 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011239
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011242 "'in <string>' requires string as left operand, not %.100s",
11243 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011244 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011245 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011246 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011247 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011248 if (ensure_unicode(str) < 0)
11249 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011252 kind2 = PyUnicode_KIND(substr);
11253 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011254 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011256 len2 = PyUnicode_GET_LENGTH(substr);
11257 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011258 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011259 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011260 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011261 if (len2 == 1) {
11262 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11263 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011264 return result;
11265 }
11266 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011267 buf2 = _PyUnicode_AsKind(substr, kind1);
11268 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011269 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271
Victor Stinner77282cb2013-04-14 19:22:47 +020011272 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 case PyUnicode_1BYTE_KIND:
11274 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11275 break;
11276 case PyUnicode_2BYTE_KIND:
11277 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11278 break;
11279 case PyUnicode_4BYTE_KIND:
11280 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11281 break;
11282 default:
11283 result = -1;
11284 assert(0);
11285 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011286
Victor Stinner77282cb2013-04-14 19:22:47 +020011287 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 PyMem_Free(buf2);
11289
Guido van Rossum403d68b2000-03-13 15:55:09 +000011290 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011291}
11292
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293/* Concat to string or Unicode object giving a new Unicode object. */
11294
Alexander Belopolsky40018472011-02-26 01:02:56 +000011295PyObject *
11296PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011298 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011299 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011300 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011302 if (ensure_unicode(left) < 0)
11303 return NULL;
11304
11305 if (!PyUnicode_Check(right)) {
11306 PyErr_Format(PyExc_TypeError,
11307 "can only concatenate str (not \"%.200s\") to str",
11308 right->ob_type->tp_name);
11309 return NULL;
11310 }
11311 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
11314 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011315 if (left == unicode_empty)
11316 return PyUnicode_FromObject(right);
11317 if (right == unicode_empty)
11318 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011320 left_len = PyUnicode_GET_LENGTH(left);
11321 right_len = PyUnicode_GET_LENGTH(right);
11322 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011323 PyErr_SetString(PyExc_OverflowError,
11324 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011325 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011326 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011327 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011328
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011329 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11330 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011331 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011334 result = PyUnicode_New(new_len, maxchar);
11335 if (result == NULL)
11336 return NULL;
11337 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11338 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11339 assert(_PyUnicode_CheckConsistency(result, 1));
11340 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341}
11342
Walter Dörwald1ab83302007-05-18 17:15:44 +000011343void
Victor Stinner23e56682011-10-03 03:54:37 +020011344PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011345{
Victor Stinner23e56682011-10-03 03:54:37 +020011346 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011347 Py_UCS4 maxchar, maxchar2;
11348 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011349
11350 if (p_left == NULL) {
11351 if (!PyErr_Occurred())
11352 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011353 return;
11354 }
Victor Stinner23e56682011-10-03 03:54:37 +020011355 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011356 if (right == NULL || left == NULL
11357 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011358 if (!PyErr_Occurred())
11359 PyErr_BadInternalCall();
11360 goto error;
11361 }
11362
Benjamin Petersonbac79492012-01-14 13:34:47 -050011363 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011364 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011365 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011366 goto error;
11367
Victor Stinner488fa492011-12-12 00:01:39 +010011368 /* Shortcuts */
11369 if (left == unicode_empty) {
11370 Py_DECREF(left);
11371 Py_INCREF(right);
11372 *p_left = right;
11373 return;
11374 }
11375 if (right == unicode_empty)
11376 return;
11377
11378 left_len = PyUnicode_GET_LENGTH(left);
11379 right_len = PyUnicode_GET_LENGTH(right);
11380 if (left_len > PY_SSIZE_T_MAX - right_len) {
11381 PyErr_SetString(PyExc_OverflowError,
11382 "strings are too large to concat");
11383 goto error;
11384 }
11385 new_len = left_len + right_len;
11386
11387 if (unicode_modifiable(left)
11388 && PyUnicode_CheckExact(right)
11389 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011390 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11391 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011392 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011393 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011394 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11395 {
11396 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011397 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011398 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011399
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011400 /* copy 'right' into the newly allocated area of 'left' */
11401 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011402 }
Victor Stinner488fa492011-12-12 00:01:39 +010011403 else {
11404 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11405 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011406 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011407
Victor Stinner488fa492011-12-12 00:01:39 +010011408 /* Concat the two Unicode strings */
11409 res = PyUnicode_New(new_len, maxchar);
11410 if (res == NULL)
11411 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011412 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11413 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011414 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011415 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011416 }
11417 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011418 return;
11419
11420error:
Victor Stinner488fa492011-12-12 00:01:39 +010011421 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011422}
11423
11424void
11425PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11426{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011427 PyUnicode_Append(pleft, right);
11428 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011429}
11430
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011431/*
11432Wraps stringlib_parse_args_finds() and additionally ensures that the
11433first argument is a unicode object.
11434*/
11435
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011436static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011437parse_args_finds_unicode(const char * function_name, PyObject *args,
11438 PyObject **substring,
11439 Py_ssize_t *start, Py_ssize_t *end)
11440{
11441 if(stringlib_parse_args_finds(function_name, args, substring,
11442 start, end)) {
11443 if (ensure_unicode(*substring) < 0)
11444 return 0;
11445 return 1;
11446 }
11447 return 0;
11448}
11449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011450PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011453Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011454string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011455interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
11457static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011458unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011460 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011461 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011462 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011464 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 void *buf1, *buf2;
11466 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011468 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 kind1 = PyUnicode_KIND(self);
11472 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011473 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011474 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 len1 = PyUnicode_GET_LENGTH(self);
11477 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011479 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011480 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011481
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011482 buf1 = PyUnicode_DATA(self);
11483 buf2 = PyUnicode_DATA(substring);
11484 if (kind2 != kind1) {
11485 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011486 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011487 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011488 }
11489 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 case PyUnicode_1BYTE_KIND:
11491 iresult = ucs1lib_count(
11492 ((Py_UCS1*)buf1) + start, end - start,
11493 buf2, len2, PY_SSIZE_T_MAX
11494 );
11495 break;
11496 case PyUnicode_2BYTE_KIND:
11497 iresult = ucs2lib_count(
11498 ((Py_UCS2*)buf1) + start, end - start,
11499 buf2, len2, PY_SSIZE_T_MAX
11500 );
11501 break;
11502 case PyUnicode_4BYTE_KIND:
11503 iresult = ucs4lib_count(
11504 ((Py_UCS4*)buf1) + start, end - start,
11505 buf2, len2, PY_SSIZE_T_MAX
11506 );
11507 break;
11508 default:
11509 assert(0); iresult = 0;
11510 }
11511
11512 result = PyLong_FromSsize_t(iresult);
11513
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011514 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517 return result;
11518}
11519
INADA Naoki3ae20562017-01-16 20:41:20 +090011520/*[clinic input]
11521str.encode as unicode_encode
11522
11523 encoding: str(c_default="NULL") = 'utf-8'
11524 The encoding in which to encode the string.
11525 errors: str(c_default="NULL") = 'strict'
11526 The error handling scheme to use for encoding errors.
11527 The default is 'strict' meaning that encoding errors raise a
11528 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11529 'xmlcharrefreplace' as well as any other name registered with
11530 codecs.register_error that can handle UnicodeEncodeErrors.
11531
11532Encode the string using the codec registered for encoding.
11533[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
11535static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011536unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011537/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011539 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011540}
11541
INADA Naoki3ae20562017-01-16 20:41:20 +090011542/*[clinic input]
11543str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544
INADA Naoki3ae20562017-01-16 20:41:20 +090011545 tabsize: int = 8
11546
11547Return a copy where all tab characters are expanded using spaces.
11548
11549If tabsize is not given, a tab size of 8 characters is assumed.
11550[clinic start generated code]*/
11551
11552static PyObject *
11553unicode_expandtabs_impl(PyObject *self, int tabsize)
11554/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011556 Py_ssize_t i, j, line_pos, src_len, incr;
11557 Py_UCS4 ch;
11558 PyObject *u;
11559 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011560 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011561 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562
Antoine Pitrou22425222011-10-04 19:10:51 +020011563 if (PyUnicode_READY(self) == -1)
11564 return NULL;
11565
Thomas Wouters7e474022000-07-16 12:04:32 +000011566 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011567 src_len = PyUnicode_GET_LENGTH(self);
11568 i = j = line_pos = 0;
11569 kind = PyUnicode_KIND(self);
11570 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011571 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011572 for (; i < src_len; i++) {
11573 ch = PyUnicode_READ(kind, src_data, i);
11574 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011575 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011577 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011579 goto overflow;
11580 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011582 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011586 goto overflow;
11587 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011589 if (ch == '\n' || ch == '\r')
11590 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011592 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011593 if (!found)
11594 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011595
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598 if (!u)
11599 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011600 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
Antoine Pitroue71d5742011-10-04 15:55:09 +020011602 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
Antoine Pitroue71d5742011-10-04 15:55:09 +020011604 for (; i < src_len; i++) {
11605 ch = PyUnicode_READ(kind, src_data, i);
11606 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011608 incr = tabsize - (line_pos % tabsize);
11609 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011610 FILL(kind, dest_data, ' ', j, incr);
11611 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011613 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011615 line_pos++;
11616 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011617 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011618 if (ch == '\n' || ch == '\r')
11619 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011621 }
11622 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011623 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011624
Antoine Pitroue71d5742011-10-04 15:55:09 +020011625 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011626 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11627 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628}
11629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011630PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632\n\
11633Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011634such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635arguments start and end are interpreted as in slice notation.\n\
11636\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011637Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638
11639static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011642 /* initialize variables to prevent gcc warning */
11643 PyObject *substring = NULL;
11644 Py_ssize_t start = 0;
11645 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011646 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011648 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011651 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011654 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 if (result == -2)
11657 return NULL;
11658
Christian Heimes217cfd12007-12-02 14:31:20 +000011659 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660}
11661
11662static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011663unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011665 void *data;
11666 enum PyUnicode_Kind kind;
11667 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011668
11669 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11670 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011672 }
11673 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11674 PyErr_SetString(PyExc_IndexError, "string index out of range");
11675 return NULL;
11676 }
11677 kind = PyUnicode_KIND(self);
11678 data = PyUnicode_DATA(self);
11679 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011680 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681}
11682
Guido van Rossumc2504932007-09-18 19:42:40 +000011683/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011684 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011685static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011686unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687{
Guido van Rossumc2504932007-09-18 19:42:40 +000011688 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011689 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011690
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011691#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011692 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011693#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 if (_PyUnicode_HASH(self) != -1)
11695 return _PyUnicode_HASH(self);
11696 if (PyUnicode_READY(self) == -1)
11697 return -1;
11698 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011699 /*
11700 We make the hash of the empty string be 0, rather than using
11701 (prefix ^ suffix), since this slightly obfuscates the hash secret
11702 */
11703 if (len == 0) {
11704 _PyUnicode_HASH(self) = 0;
11705 return 0;
11706 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011707 x = _Py_HashBytes(PyUnicode_DATA(self),
11708 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011710 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711}
11712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011713PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011714 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011716Return the lowest index in S where substring sub is found, \n\
11717such that sub is contained within S[start:end]. Optional\n\
11718arguments start and end are interpreted as in slice notation.\n\
11719\n\
11720Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
11722static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011725 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011726 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011727 PyObject *substring = NULL;
11728 Py_ssize_t start = 0;
11729 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011731 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011734 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011737 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (result == -2)
11740 return NULL;
11741
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 if (result < 0) {
11743 PyErr_SetString(PyExc_ValueError, "substring not found");
11744 return NULL;
11745 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011746
Christian Heimes217cfd12007-12-02 14:31:20 +000011747 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748}
11749
INADA Naoki3ae20562017-01-16 20:41:20 +090011750/*[clinic input]
11751str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752
INADA Naoki3ae20562017-01-16 20:41:20 +090011753Return True if the string is a lowercase string, False otherwise.
11754
11755A string is lowercase if all cased characters in the string are lowercase and
11756there is at least one cased character in the string.
11757[clinic start generated code]*/
11758
11759static PyObject *
11760unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011761/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 Py_ssize_t i, length;
11764 int kind;
11765 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766 int cased;
11767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 if (PyUnicode_READY(self) == -1)
11769 return NULL;
11770 length = PyUnicode_GET_LENGTH(self);
11771 kind = PyUnicode_KIND(self);
11772 data = PyUnicode_DATA(self);
11773
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 if (length == 1)
11776 return PyBool_FromLong(
11777 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011779 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011781 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011782
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 for (i = 0; i < length; i++) {
11785 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011786
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011788 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 else if (!cased && Py_UNICODE_ISLOWER(ch))
11790 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011792 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793}
11794
INADA Naoki3ae20562017-01-16 20:41:20 +090011795/*[clinic input]
11796str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797
INADA Naoki3ae20562017-01-16 20:41:20 +090011798Return True if the string is an uppercase string, False otherwise.
11799
11800A string is uppercase if all cased characters in the string are uppercase and
11801there is at least one cased character in the string.
11802[clinic start generated code]*/
11803
11804static PyObject *
11805unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011806/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 Py_ssize_t i, length;
11809 int kind;
11810 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 int cased;
11812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 if (PyUnicode_READY(self) == -1)
11814 return NULL;
11815 length = PyUnicode_GET_LENGTH(self);
11816 kind = PyUnicode_KIND(self);
11817 data = PyUnicode_DATA(self);
11818
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 if (length == 1)
11821 return PyBool_FromLong(
11822 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011824 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011826 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011827
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 for (i = 0; i < length; i++) {
11830 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011831
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011833 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 else if (!cased && Py_UNICODE_ISUPPER(ch))
11835 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011837 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838}
11839
INADA Naoki3ae20562017-01-16 20:41:20 +090011840/*[clinic input]
11841str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842
INADA Naoki3ae20562017-01-16 20:41:20 +090011843Return True if the string is a title-cased string, False otherwise.
11844
11845In a title-cased string, upper- and title-case characters may only
11846follow uncased characters and lowercase characters only cased ones.
11847[clinic start generated code]*/
11848
11849static PyObject *
11850unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011851/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 Py_ssize_t i, length;
11854 int kind;
11855 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856 int cased, previous_is_cased;
11857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 if (PyUnicode_READY(self) == -1)
11859 return NULL;
11860 length = PyUnicode_GET_LENGTH(self);
11861 kind = PyUnicode_KIND(self);
11862 data = PyUnicode_DATA(self);
11863
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 if (length == 1) {
11866 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11867 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11868 (Py_UNICODE_ISUPPER(ch) != 0));
11869 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011871 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011873 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011874
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875 cased = 0;
11876 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 for (i = 0; i < length; i++) {
11878 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011879
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11881 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011882 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 previous_is_cased = 1;
11884 cased = 1;
11885 }
11886 else if (Py_UNICODE_ISLOWER(ch)) {
11887 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011888 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 previous_is_cased = 1;
11890 cased = 1;
11891 }
11892 else
11893 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011895 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896}
11897
INADA Naoki3ae20562017-01-16 20:41:20 +090011898/*[clinic input]
11899str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
INADA Naoki3ae20562017-01-16 20:41:20 +090011901Return True if the string is a whitespace string, False otherwise.
11902
11903A string is whitespace if all characters in the string are whitespace and there
11904is at least one character in the string.
11905[clinic start generated code]*/
11906
11907static PyObject *
11908unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011909/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 Py_ssize_t i, length;
11912 int kind;
11913 void *data;
11914
11915 if (PyUnicode_READY(self) == -1)
11916 return NULL;
11917 length = PyUnicode_GET_LENGTH(self);
11918 kind = PyUnicode_KIND(self);
11919 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 if (length == 1)
11923 return PyBool_FromLong(
11924 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011926 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011928 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 for (i = 0; i < length; i++) {
11931 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011932 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011933 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011935 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936}
11937
INADA Naoki3ae20562017-01-16 20:41:20 +090011938/*[clinic input]
11939str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011940
INADA Naoki3ae20562017-01-16 20:41:20 +090011941Return True if the string is an alphabetic string, False otherwise.
11942
11943A string is alphabetic if all characters in the string are alphabetic and there
11944is at least one character in the string.
11945[clinic start generated code]*/
11946
11947static PyObject *
11948unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011949/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011950{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 Py_ssize_t i, length;
11952 int kind;
11953 void *data;
11954
11955 if (PyUnicode_READY(self) == -1)
11956 return NULL;
11957 length = PyUnicode_GET_LENGTH(self);
11958 kind = PyUnicode_KIND(self);
11959 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011960
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011961 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 if (length == 1)
11963 return PyBool_FromLong(
11964 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011965
11966 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011968 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 for (i = 0; i < length; i++) {
11971 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011972 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011973 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011974 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011975}
11976
INADA Naoki3ae20562017-01-16 20:41:20 +090011977/*[clinic input]
11978str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011979
INADA Naoki3ae20562017-01-16 20:41:20 +090011980Return True if the string is an alpha-numeric string, False otherwise.
11981
11982A string is alpha-numeric if all characters in the string are alpha-numeric and
11983there is at least one character in the string.
11984[clinic start generated code]*/
11985
11986static PyObject *
11987unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011988/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 int kind;
11991 void *data;
11992 Py_ssize_t len, i;
11993
11994 if (PyUnicode_READY(self) == -1)
11995 return NULL;
11996
11997 kind = PyUnicode_KIND(self);
11998 data = PyUnicode_DATA(self);
11999 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012000
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012001 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (len == 1) {
12003 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12004 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12005 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012006
12007 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012009 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 for (i = 0; i < len; i++) {
12012 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012013 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012014 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012015 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012016 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012017}
12018
INADA Naoki3ae20562017-01-16 20:41:20 +090012019/*[clinic input]
12020str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021
INADA Naoki3ae20562017-01-16 20:41:20 +090012022Return True if the string is a decimal string, False otherwise.
12023
12024A string is a decimal string if all characters in the string are decimal and
12025there is at least one character in the string.
12026[clinic start generated code]*/
12027
12028static PyObject *
12029unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012030/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 Py_ssize_t i, length;
12033 int kind;
12034 void *data;
12035
12036 if (PyUnicode_READY(self) == -1)
12037 return NULL;
12038 length = PyUnicode_GET_LENGTH(self);
12039 kind = PyUnicode_KIND(self);
12040 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 if (length == 1)
12044 return PyBool_FromLong(
12045 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012047 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012049 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 for (i = 0; i < length; i++) {
12052 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012053 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012055 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056}
12057
INADA Naoki3ae20562017-01-16 20:41:20 +090012058/*[clinic input]
12059str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060
INADA Naoki3ae20562017-01-16 20:41:20 +090012061Return True if the string is a digit string, False otherwise.
12062
12063A string is a digit string if all characters in the string are digits and there
12064is at least one character in the string.
12065[clinic start generated code]*/
12066
12067static PyObject *
12068unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012069/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 Py_ssize_t i, length;
12072 int kind;
12073 void *data;
12074
12075 if (PyUnicode_READY(self) == -1)
12076 return NULL;
12077 length = PyUnicode_GET_LENGTH(self);
12078 kind = PyUnicode_KIND(self);
12079 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 if (length == 1) {
12083 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12084 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12085 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012087 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012089 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 for (i = 0; i < length; i++) {
12092 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012093 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012095 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096}
12097
INADA Naoki3ae20562017-01-16 20:41:20 +090012098/*[clinic input]
12099str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100
INADA Naoki3ae20562017-01-16 20:41:20 +090012101Return True if the string is a numeric string, False otherwise.
12102
12103A string is numeric if all characters in the string are numeric and there is at
12104least one character in the string.
12105[clinic start generated code]*/
12106
12107static PyObject *
12108unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012109/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 Py_ssize_t i, length;
12112 int kind;
12113 void *data;
12114
12115 if (PyUnicode_READY(self) == -1)
12116 return NULL;
12117 length = PyUnicode_GET_LENGTH(self);
12118 kind = PyUnicode_KIND(self);
12119 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 if (length == 1)
12123 return PyBool_FromLong(
12124 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012126 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012128 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 for (i = 0; i < length; i++) {
12131 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012132 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012134 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135}
12136
Martin v. Löwis47383402007-08-15 07:32:56 +000012137int
12138PyUnicode_IsIdentifier(PyObject *self)
12139{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 int kind;
12141 void *data;
12142 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012143 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 if (PyUnicode_READY(self) == -1) {
12146 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 }
12149
12150 /* Special case for empty strings */
12151 if (PyUnicode_GET_LENGTH(self) == 0)
12152 return 0;
12153 kind = PyUnicode_KIND(self);
12154 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012155
12156 /* PEP 3131 says that the first character must be in
12157 XID_Start and subsequent characters in XID_Continue,
12158 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012159 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012160 letters, digits, underscore). However, given the current
12161 definition of XID_Start and XID_Continue, it is sufficient
12162 to check just for these, except that _ must be allowed
12163 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012165 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012166 return 0;
12167
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012168 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012171 return 1;
12172}
12173
INADA Naoki3ae20562017-01-16 20:41:20 +090012174/*[clinic input]
12175str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012176
INADA Naoki3ae20562017-01-16 20:41:20 +090012177Return True if the string is a valid Python identifier, False otherwise.
12178
12179Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12180"class".
12181[clinic start generated code]*/
12182
12183static PyObject *
12184unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012185/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012186{
12187 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12188}
12189
INADA Naoki3ae20562017-01-16 20:41:20 +090012190/*[clinic input]
12191str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012192
INADA Naoki3ae20562017-01-16 20:41:20 +090012193Return True if the string is printable, False otherwise.
12194
12195A string is printable if all of its characters are considered printable in
12196repr() or if it is empty.
12197[clinic start generated code]*/
12198
12199static PyObject *
12200unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012201/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012202{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 Py_ssize_t i, length;
12204 int kind;
12205 void *data;
12206
12207 if (PyUnicode_READY(self) == -1)
12208 return NULL;
12209 length = PyUnicode_GET_LENGTH(self);
12210 kind = PyUnicode_KIND(self);
12211 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012212
12213 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 if (length == 1)
12215 return PyBool_FromLong(
12216 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 for (i = 0; i < length; i++) {
12219 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012220 Py_RETURN_FALSE;
12221 }
12222 }
12223 Py_RETURN_TRUE;
12224}
12225
INADA Naoki3ae20562017-01-16 20:41:20 +090012226/*[clinic input]
12227str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228
INADA Naoki3ae20562017-01-16 20:41:20 +090012229 iterable: object
12230 /
12231
12232Concatenate any number of strings.
12233
Martin Panter91a88662017-01-24 00:30:06 +000012234The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012235The result is returned as a new string.
12236
12237Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12238[clinic start generated code]*/
12239
12240static PyObject *
12241unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012242/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243{
INADA Naoki3ae20562017-01-16 20:41:20 +090012244 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245}
12246
Martin v. Löwis18e16552006-02-15 17:27:45 +000012247static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012248unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 if (PyUnicode_READY(self) == -1)
12251 return -1;
12252 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253}
12254
INADA Naoki3ae20562017-01-16 20:41:20 +090012255/*[clinic input]
12256str.ljust as unicode_ljust
12257
12258 width: Py_ssize_t
12259 fillchar: Py_UCS4 = ' '
12260 /
12261
12262Return a left-justified string of length width.
12263
12264Padding is done using the specified fill character (default is a space).
12265[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
12267static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012268unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12269/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012271 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273
Victor Stinnerc4b49542011-12-11 22:44:26 +010012274 if (PyUnicode_GET_LENGTH(self) >= width)
12275 return unicode_result_unchanged(self);
12276
12277 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278}
12279
INADA Naoki3ae20562017-01-16 20:41:20 +090012280/*[clinic input]
12281str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
INADA Naoki3ae20562017-01-16 20:41:20 +090012283Return a copy of the string converted to lowercase.
12284[clinic start generated code]*/
12285
12286static PyObject *
12287unicode_lower_impl(PyObject *self)
12288/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012290 if (PyUnicode_READY(self) == -1)
12291 return NULL;
12292 if (PyUnicode_IS_ASCII(self))
12293 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012294 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295}
12296
/* Strip modes.  The values double as indexes into stripfuncnames[]. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* Method names, indexed by the strip mode constants above. */
static const char *stripfuncnames[] = {
    "lstrip",
    "rstrip",
    "strip"
};

/* Name of the str method that corresponds to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012305
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012306/* externally visible for str.strip(unicode) */
12307PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012308_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 void *data;
12311 int kind;
12312 Py_ssize_t i, j, len;
12313 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012314 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12317 return NULL;
12318
12319 kind = PyUnicode_KIND(self);
12320 data = PyUnicode_DATA(self);
12321 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012322 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12324 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012325 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012326
Benjamin Peterson14339b62009-01-31 16:36:08 +000012327 i = 0;
12328 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012329 while (i < len) {
12330 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12331 if (!BLOOM(sepmask, ch))
12332 break;
12333 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12334 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 i++;
12336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012337 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012338
Benjamin Peterson14339b62009-01-31 16:36:08 +000012339 j = len;
12340 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012341 j--;
12342 while (j >= i) {
12343 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12344 if (!BLOOM(sepmask, ch))
12345 break;
12346 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12347 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012349 }
12350
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012352 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012353
Victor Stinner7931d9a2011-11-04 00:22:48 +010012354 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355}
12356
12357PyObject*
12358PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12359{
12360 unsigned char *data;
12361 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012362 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363
Victor Stinnerde636f32011-10-01 03:55:54 +020012364 if (PyUnicode_READY(self) == -1)
12365 return NULL;
12366
Victor Stinner684d5fd2012-05-03 02:32:34 +020012367 length = PyUnicode_GET_LENGTH(self);
12368 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012369
Victor Stinner684d5fd2012-05-03 02:32:34 +020012370 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012371 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372
Victor Stinnerde636f32011-10-01 03:55:54 +020012373 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012374 PyErr_SetString(PyExc_IndexError, "string index out of range");
12375 return NULL;
12376 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012377 if (start >= length || end < start)
12378 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012379
Victor Stinner684d5fd2012-05-03 02:32:34 +020012380 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012381 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012382 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012383 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012384 }
12385 else {
12386 kind = PyUnicode_KIND(self);
12387 data = PyUnicode_1BYTE_DATA(self);
12388 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012389 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012390 length);
12391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393
12394static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012395do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 Py_ssize_t len, i, j;
12398
12399 if (PyUnicode_READY(self) == -1)
12400 return NULL;
12401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012403
Victor Stinnercc7af722013-04-09 22:39:24 +020012404 if (PyUnicode_IS_ASCII(self)) {
12405 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12406
12407 i = 0;
12408 if (striptype != RIGHTSTRIP) {
12409 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012410 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012411 if (!_Py_ascii_whitespace[ch])
12412 break;
12413 i++;
12414 }
12415 }
12416
12417 j = len;
12418 if (striptype != LEFTSTRIP) {
12419 j--;
12420 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012421 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012422 if (!_Py_ascii_whitespace[ch])
12423 break;
12424 j--;
12425 }
12426 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012427 }
12428 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012429 else {
12430 int kind = PyUnicode_KIND(self);
12431 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012432
Victor Stinnercc7af722013-04-09 22:39:24 +020012433 i = 0;
12434 if (striptype != RIGHTSTRIP) {
12435 while (i < len) {
12436 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12437 if (!Py_UNICODE_ISSPACE(ch))
12438 break;
12439 i++;
12440 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012441 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012442
12443 j = len;
12444 if (striptype != LEFTSTRIP) {
12445 j--;
12446 while (j >= i) {
12447 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12448 if (!Py_UNICODE_ISSPACE(ch))
12449 break;
12450 j--;
12451 }
12452 j++;
12453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012454 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012455
Victor Stinner7931d9a2011-11-04 00:22:48 +010012456 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457}
12458
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012459
12460static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012461do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012462{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012463 if (sep != NULL && sep != Py_None) {
12464 if (PyUnicode_Check(sep))
12465 return _PyUnicode_XStrip(self, striptype, sep);
12466 else {
12467 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 "%s arg must be None or str",
12469 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012470 return NULL;
12471 }
12472 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012473
Benjamin Peterson14339b62009-01-31 16:36:08 +000012474 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012475}
12476
12477
INADA Naoki3ae20562017-01-16 20:41:20 +090012478/*[clinic input]
12479str.strip as unicode_strip
12480
12481 chars: object = None
12482 /
12483
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012485
12486If chars is given and not None, remove characters in chars instead.
12487[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012488
12489static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012490unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012491/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012492{
INADA Naoki3ae20562017-01-16 20:41:20 +090012493 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012494}
12495
12496
INADA Naoki3ae20562017-01-16 20:41:20 +090012497/*[clinic input]
12498str.lstrip as unicode_lstrip
12499
12500 chars: object = NULL
12501 /
12502
12503Return a copy of the string with leading whitespace removed.
12504
12505If chars is given and not None, remove characters in chars instead.
12506[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012507
12508static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012509unicode_lstrip_impl(PyObject *self, PyObject *chars)
12510/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012511{
INADA Naoki3ae20562017-01-16 20:41:20 +090012512 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012513}
12514
12515
INADA Naoki3ae20562017-01-16 20:41:20 +090012516/*[clinic input]
12517str.rstrip as unicode_rstrip
12518
12519 chars: object = NULL
12520 /
12521
12522Return a copy of the string with trailing whitespace removed.
12523
12524If chars is given and not None, remove characters in chars instead.
12525[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012526
12527static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012528unicode_rstrip_impl(PyObject *self, PyObject *chars)
12529/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012530{
INADA Naoki3ae20562017-01-16 20:41:20 +090012531 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012532}
12533
12534
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012536unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012538 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
Serhiy Storchaka05997252013-01-26 12:14:02 +020012541 if (len < 1)
12542 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
Victor Stinnerc4b49542011-12-11 22:44:26 +010012544 /* no repeat, return original string */
12545 if (len == 1)
12546 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012547
Benjamin Petersonbac79492012-01-14 13:34:47 -050012548 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 return NULL;
12550
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012551 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012552 PyErr_SetString(PyExc_OverflowError,
12553 "repeated string is too long");
12554 return NULL;
12555 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012557
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012558 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559 if (!u)
12560 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012561 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 if (PyUnicode_GET_LENGTH(str) == 1) {
12564 const int kind = PyUnicode_KIND(str);
12565 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012566 if (kind == PyUnicode_1BYTE_KIND) {
12567 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012568 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012569 }
12570 else if (kind == PyUnicode_2BYTE_KIND) {
12571 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012572 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012573 ucs2[n] = fill_char;
12574 } else {
12575 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12576 assert(kind == PyUnicode_4BYTE_KIND);
12577 for (n = 0; n < len; ++n)
12578 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 }
12581 else {
12582 /* number of characters copied this far */
12583 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012584 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012586 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012588 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012590 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012591 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593 }
12594
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012595 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012596 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597}
12598
Alexander Belopolsky40018472011-02-26 01:02:56 +000012599PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012600PyUnicode_Replace(PyObject *str,
12601 PyObject *substr,
12602 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012603 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012605 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12606 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012608 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609}
12610
INADA Naoki3ae20562017-01-16 20:41:20 +090012611/*[clinic input]
12612str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613
INADA Naoki3ae20562017-01-16 20:41:20 +090012614 old: unicode
12615 new: unicode
12616 count: Py_ssize_t = -1
12617 Maximum number of occurrences to replace.
12618 -1 (the default value) means replace all occurrences.
12619 /
12620
12621Return a copy with all occurrences of substring old replaced by new.
12622
12623If the optional argument count is given, only the first count occurrences are
12624replaced.
12625[clinic start generated code]*/
12626
12627static PyObject *
12628unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12629 Py_ssize_t count)
12630/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012632 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012634 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635}
12636
Alexander Belopolsky40018472011-02-26 01:02:56 +000012637static PyObject *
12638unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012640 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 Py_ssize_t isize;
12642 Py_ssize_t osize, squote, dquote, i, o;
12643 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012644 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012648 return NULL;
12649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 isize = PyUnicode_GET_LENGTH(unicode);
12651 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 /* Compute length of output, quote characters, and
12654 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012655 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 max = 127;
12657 squote = dquote = 0;
12658 ikind = PyUnicode_KIND(unicode);
12659 for (i = 0; i < isize; i++) {
12660 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012661 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012663 case '\'': squote++; break;
12664 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012666 incr = 2;
12667 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 default:
12669 /* Fast-path ASCII */
12670 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012671 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012673 ;
12674 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012677 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012679 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012681 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012683 if (osize > PY_SSIZE_T_MAX - incr) {
12684 PyErr_SetString(PyExc_OverflowError,
12685 "string is too long to generate repr");
12686 return NULL;
12687 }
12688 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 }
12690
12691 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012692 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012694 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 if (dquote)
12696 /* Both squote and dquote present. Use squote,
12697 and escape them */
12698 osize += squote;
12699 else
12700 quote = '"';
12701 }
Victor Stinner55c08782013-04-14 18:45:39 +020012702 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703
12704 repr = PyUnicode_New(osize, max);
12705 if (repr == NULL)
12706 return NULL;
12707 okind = PyUnicode_KIND(repr);
12708 odata = PyUnicode_DATA(repr);
12709
12710 PyUnicode_WRITE(okind, odata, 0, quote);
12711 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012712 if (unchanged) {
12713 _PyUnicode_FastCopyCharacters(repr, 1,
12714 unicode, 0,
12715 isize);
12716 }
12717 else {
12718 for (i = 0, o = 1; i < isize; i++) {
12719 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720
Victor Stinner55c08782013-04-14 18:45:39 +020012721 /* Escape quotes and backslashes */
12722 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012723 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012725 continue;
12726 }
12727
12728 /* Map special whitespace to '\t', \n', '\r' */
12729 if (ch == '\t') {
12730 PyUnicode_WRITE(okind, odata, o++, '\\');
12731 PyUnicode_WRITE(okind, odata, o++, 't');
12732 }
12733 else if (ch == '\n') {
12734 PyUnicode_WRITE(okind, odata, o++, '\\');
12735 PyUnicode_WRITE(okind, odata, o++, 'n');
12736 }
12737 else if (ch == '\r') {
12738 PyUnicode_WRITE(okind, odata, o++, '\\');
12739 PyUnicode_WRITE(okind, odata, o++, 'r');
12740 }
12741
12742 /* Map non-printable US ASCII to '\xhh' */
12743 else if (ch < ' ' || ch == 0x7F) {
12744 PyUnicode_WRITE(okind, odata, o++, '\\');
12745 PyUnicode_WRITE(okind, odata, o++, 'x');
12746 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12747 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12748 }
12749
12750 /* Copy ASCII characters as-is */
12751 else if (ch < 0x7F) {
12752 PyUnicode_WRITE(okind, odata, o++, ch);
12753 }
12754
12755 /* Non-ASCII characters */
12756 else {
12757 /* Map Unicode whitespace and control characters
12758 (categories Z* and C* except ASCII space)
12759 */
12760 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12761 PyUnicode_WRITE(okind, odata, o++, '\\');
12762 /* Map 8-bit characters to '\xhh' */
12763 if (ch <= 0xff) {
12764 PyUnicode_WRITE(okind, odata, o++, 'x');
12765 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12767 }
12768 /* Map 16-bit characters to '\uxxxx' */
12769 else if (ch <= 0xffff) {
12770 PyUnicode_WRITE(okind, odata, o++, 'u');
12771 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12775 }
12776 /* Map 21-bit characters to '\U00xxxxxx' */
12777 else {
12778 PyUnicode_WRITE(okind, odata, o++, 'U');
12779 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12782 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12783 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12784 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12785 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12786 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12787 }
12788 }
12789 /* Copy characters as-is */
12790 else {
12791 PyUnicode_WRITE(okind, odata, o++, ch);
12792 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012793 }
12794 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012797 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012798 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799}
12800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012801PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803\n\
12804Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012805such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806arguments start and end are interpreted as in slice notation.\n\
12807\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012808Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
12810static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012813 /* initialize variables to prevent gcc warning */
12814 PyObject *substring = NULL;
12815 Py_ssize_t start = 0;
12816 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012819 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012822 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012825 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 if (result == -2)
12828 return NULL;
12829
Christian Heimes217cfd12007-12-02 14:31:20 +000012830 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831}
12832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012833PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012834 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012836Return the highest index in S where substring sub is found,\n\
12837such that sub is contained within S[start:end]. Optional\n\
12838arguments start and end are interpreted as in slice notation.\n\
12839\n\
12840Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841
12842static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012845 /* initialize variables to prevent gcc warning */
12846 PyObject *substring = NULL;
12847 Py_ssize_t start = 0;
12848 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012849 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012851 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012854 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012855 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012857 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012859 if (result == -2)
12860 return NULL;
12861
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862 if (result < 0) {
12863 PyErr_SetString(PyExc_ValueError, "substring not found");
12864 return NULL;
12865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866
Christian Heimes217cfd12007-12-02 14:31:20 +000012867 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868}
12869
INADA Naoki3ae20562017-01-16 20:41:20 +090012870/*[clinic input]
12871str.rjust as unicode_rjust
12872
12873 width: Py_ssize_t
12874 fillchar: Py_UCS4 = ' '
12875 /
12876
12877Return a right-justified string of length width.
12878
12879Padding is done using the specified fill character (default is a space).
12880[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881
12882static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012883unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12884/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012886 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887 return NULL;
12888
Victor Stinnerc4b49542011-12-11 22:44:26 +010012889 if (PyUnicode_GET_LENGTH(self) >= width)
12890 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891
Victor Stinnerc4b49542011-12-11 22:44:26 +010012892 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893}
12894
Alexander Belopolsky40018472011-02-26 01:02:56 +000012895PyObject *
12896PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012898 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012901 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902}
12903
INADA Naoki3ae20562017-01-16 20:41:20 +090012904/*[clinic input]
12905str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906
INADA Naoki3ae20562017-01-16 20:41:20 +090012907 sep: object = None
        The delimiter according to which to split the string.
12909 None (the default value) means split according to any whitespace,
12910 and discard empty strings from the result.
12911 maxsplit: Py_ssize_t = -1
12912 Maximum number of splits to do.
12913 -1 (the default value) means no limit.
12914
12915Return a list of the words in the string, using sep as the delimiter string.
12916[clinic start generated code]*/
12917
12918static PyObject *
12919unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12920/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012921{
INADA Naoki3ae20562017-01-16 20:41:20 +090012922 if (sep == Py_None)
12923 return split(self, NULL, maxsplit);
12924 if (PyUnicode_Check(sep))
12925 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012926
12927 PyErr_Format(PyExc_TypeError,
12928 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012929 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012931}
12932
Thomas Wouters477c8d52006-05-27 19:21:47 +000012933PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012934PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012935{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012936 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012937 int kind1, kind2;
12938 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012940
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012941 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012943
Victor Stinner14f8f022011-10-05 20:58:25 +020012944 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 len1 = PyUnicode_GET_LENGTH(str_obj);
12947 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012948 if (kind1 < kind2 || len1 < len2) {
12949 _Py_INCREF_UNICODE_EMPTY();
12950 if (!unicode_empty)
12951 out = NULL;
12952 else {
12953 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12954 Py_DECREF(unicode_empty);
12955 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012956 return out;
12957 }
12958 buf1 = PyUnicode_DATA(str_obj);
12959 buf2 = PyUnicode_DATA(sep_obj);
12960 if (kind2 != kind1) {
12961 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12962 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012963 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012966 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012968 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12969 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12970 else
12971 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 break;
12973 case PyUnicode_2BYTE_KIND:
12974 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12975 break;
12976 case PyUnicode_4BYTE_KIND:
12977 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12978 break;
12979 default:
12980 assert(0);
12981 out = 0;
12982 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012983
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012984 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012986
12987 return out;
12988}
12989
12990
12991PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012992PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012993{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012995 int kind1, kind2;
12996 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012998
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012999 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013000 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013001
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013002 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 len1 = PyUnicode_GET_LENGTH(str_obj);
13005 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013006 if (kind1 < kind2 || len1 < len2) {
13007 _Py_INCREF_UNICODE_EMPTY();
13008 if (!unicode_empty)
13009 out = NULL;
13010 else {
13011 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13012 Py_DECREF(unicode_empty);
13013 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013014 return out;
13015 }
13016 buf1 = PyUnicode_DATA(str_obj);
13017 buf2 = PyUnicode_DATA(sep_obj);
13018 if (kind2 != kind1) {
13019 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13020 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013021 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013024 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013026 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13027 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13028 else
13029 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 break;
13031 case PyUnicode_2BYTE_KIND:
13032 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13033 break;
13034 case PyUnicode_4BYTE_KIND:
13035 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13036 break;
13037 default:
13038 assert(0);
13039 out = 0;
13040 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013041
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013042 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013044
13045 return out;
13046}
13047
INADA Naoki3ae20562017-01-16 20:41:20 +090013048/*[clinic input]
13049str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013050
INADA Naoki3ae20562017-01-16 20:41:20 +090013051 sep: object
13052 /
13053
13054Partition the string into three parts using the given separator.
13055
13056This will search for the separator in the string. If the separator is found,
13057returns a 3-tuple containing the part before the separator, the separator
13058itself, and the part after it.
13059
13060If the separator is not found, returns a 3-tuple containing the original string
13061and two empty strings.
13062[clinic start generated code]*/
13063
13064static PyObject *
13065unicode_partition(PyObject *self, PyObject *sep)
13066/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013067{
INADA Naoki3ae20562017-01-16 20:41:20 +090013068 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013069}
13070
INADA Naoki3ae20562017-01-16 20:41:20 +090013071/*[clinic input]
13072str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013073
INADA Naoki3ae20562017-01-16 20:41:20 +090013074Partition the string into three parts using the given separator.
13075
13076This will search for the separator in the string, starting at the end. If
13077the separator is found, returns a 3-tuple containing the part before the
13078separator, the separator itself, and the part after it.
13079
13080If the separator is not found, returns a 3-tuple containing two empty strings
13081and the original string.
13082[clinic start generated code]*/
13083
13084static PyObject *
13085unicode_rpartition(PyObject *self, PyObject *sep)
13086/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013087{
INADA Naoki3ae20562017-01-16 20:41:20 +090013088 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013089}
13090
Alexander Belopolsky40018472011-02-26 01:02:56 +000013091PyObject *
13092PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013093{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013094 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013095 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013096
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013097 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013098}
13099
INADA Naoki3ae20562017-01-16 20:41:20 +090013100/*[clinic input]
13101str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013102
INADA Naoki3ae20562017-01-16 20:41:20 +090013103Return a list of the words in the string, using sep as the delimiter string.
13104
13105Splits are done starting at the end of the string and working to the front.
13106[clinic start generated code]*/
13107
13108static PyObject *
13109unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13110/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013111{
INADA Naoki3ae20562017-01-16 20:41:20 +090013112 if (sep == Py_None)
13113 return rsplit(self, NULL, maxsplit);
13114 if (PyUnicode_Check(sep))
13115 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013116
13117 PyErr_Format(PyExc_TypeError,
13118 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013119 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013120 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013121}
13122
INADA Naoki3ae20562017-01-16 20:41:20 +090013123/*[clinic input]
13124str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013126 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013127
13128Return a list of the lines in the string, breaking at line boundaries.
13129
13130Line breaks are not included in the resulting list unless keepends is given and
13131true.
13132[clinic start generated code]*/
13133
13134static PyObject *
13135unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013136/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013138 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139}
13140
13141static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013142PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013144 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145}
13146
INADA Naoki3ae20562017-01-16 20:41:20 +090013147/*[clinic input]
13148str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149
INADA Naoki3ae20562017-01-16 20:41:20 +090013150Convert uppercase characters to lowercase and lowercase characters to uppercase.
13151[clinic start generated code]*/
13152
13153static PyObject *
13154unicode_swapcase_impl(PyObject *self)
13155/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013157 if (PyUnicode_READY(self) == -1)
13158 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013159 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160}
13161
Larry Hastings61272b72014-01-07 12:41:53 -080013162/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013163
Larry Hastings31826802013-10-19 00:09:25 -070013164@staticmethod
13165str.maketrans as unicode_maketrans
13166
13167 x: object
13168
13169 y: unicode=NULL
13170
13171 z: unicode=NULL
13172
13173 /
13174
13175Return a translation table usable for str.translate().
13176
13177If there is only one argument, it must be a dictionary mapping Unicode
13178ordinals (integers) or characters to Unicode ordinals, strings or None.
13179Character keys will be then converted to ordinals.
13180If there are two arguments, they must be strings of equal length, and
13181in the resulting dictionary, each character in x will be mapped to the
13182character at the same position in y. If there is a third argument, it
13183must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013184[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013185
Larry Hastings31826802013-10-19 00:09:25 -070013186static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013187unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013188/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013189{
Georg Brandlceee0772007-11-27 23:48:05 +000013190 PyObject *new = NULL, *key, *value;
13191 Py_ssize_t i = 0;
13192 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013193
Georg Brandlceee0772007-11-27 23:48:05 +000013194 new = PyDict_New();
13195 if (!new)
13196 return NULL;
13197 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 int x_kind, y_kind, z_kind;
13199 void *x_data, *y_data, *z_data;
13200
Georg Brandlceee0772007-11-27 23:48:05 +000013201 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013202 if (!PyUnicode_Check(x)) {
13203 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13204 "be a string if there is a second argument");
13205 goto err;
13206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013208 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13209 "arguments must have equal length");
13210 goto err;
13211 }
13212 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 x_kind = PyUnicode_KIND(x);
13214 y_kind = PyUnicode_KIND(y);
13215 x_data = PyUnicode_DATA(x);
13216 y_data = PyUnicode_DATA(y);
13217 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13218 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013219 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013220 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013221 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013222 if (!value) {
13223 Py_DECREF(key);
13224 goto err;
13225 }
Georg Brandlceee0772007-11-27 23:48:05 +000013226 res = PyDict_SetItem(new, key, value);
13227 Py_DECREF(key);
13228 Py_DECREF(value);
13229 if (res < 0)
13230 goto err;
13231 }
13232 /* create entries for deleting chars in z */
13233 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 z_kind = PyUnicode_KIND(z);
13235 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013236 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013237 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013238 if (!key)
13239 goto err;
13240 res = PyDict_SetItem(new, key, Py_None);
13241 Py_DECREF(key);
13242 if (res < 0)
13243 goto err;
13244 }
13245 }
13246 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013247 int kind;
13248 void *data;
13249
Georg Brandlceee0772007-11-27 23:48:05 +000013250 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013251 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013252 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13253 "to maketrans it must be a dict");
13254 goto err;
13255 }
13256 /* copy entries into the new dict, converting string keys to int keys */
13257 while (PyDict_Next(x, &i, &key, &value)) {
13258 if (PyUnicode_Check(key)) {
13259 /* convert string keys to integer keys */
13260 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013261 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013262 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13263 "table must be of length 1");
13264 goto err;
13265 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013266 kind = PyUnicode_KIND(key);
13267 data = PyUnicode_DATA(key);
13268 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013269 if (!newkey)
13270 goto err;
13271 res = PyDict_SetItem(new, newkey, value);
13272 Py_DECREF(newkey);
13273 if (res < 0)
13274 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013275 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013276 /* just keep integer keys */
13277 if (PyDict_SetItem(new, key, value) < 0)
13278 goto err;
13279 } else {
13280 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13281 "be strings or integers");
13282 goto err;
13283 }
13284 }
13285 }
13286 return new;
13287 err:
13288 Py_DECREF(new);
13289 return NULL;
13290}
13291
INADA Naoki3ae20562017-01-16 20:41:20 +090013292/*[clinic input]
13293str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294
INADA Naoki3ae20562017-01-16 20:41:20 +090013295 table: object
13296 Translation table, which must be a mapping of Unicode ordinals to
13297 Unicode ordinals, strings, or None.
13298 /
13299
13300Replace each character in the string using the given translation table.
13301
13302The table must implement lookup/indexing via __getitem__, for instance a
13303dictionary or list. If this operation raises LookupError, the character is
13304left untouched. Characters mapped to None are deleted.
13305[clinic start generated code]*/
13306
13307static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013308unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013309/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312}
13313
INADA Naoki3ae20562017-01-16 20:41:20 +090013314/*[clinic input]
13315str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316
INADA Naoki3ae20562017-01-16 20:41:20 +090013317Return a copy of the string converted to uppercase.
13318[clinic start generated code]*/
13319
13320static PyObject *
13321unicode_upper_impl(PyObject *self)
13322/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013324 if (PyUnicode_READY(self) == -1)
13325 return NULL;
13326 if (PyUnicode_IS_ASCII(self))
13327 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013328 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013329}
13330
INADA Naoki3ae20562017-01-16 20:41:20 +090013331/*[clinic input]
13332str.zfill as unicode_zfill
13333
13334 width: Py_ssize_t
13335 /
13336
13337Pad a numeric string with zeros on the left, to fill a field of the given width.
13338
13339The string is never truncated.
13340[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341
13342static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013343unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013344/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013346 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013347 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 int kind;
13349 void *data;
13350 Py_UCS4 chr;
13351
Benjamin Petersonbac79492012-01-14 13:34:47 -050013352 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013354
Victor Stinnerc4b49542011-12-11 22:44:26 +010013355 if (PyUnicode_GET_LENGTH(self) >= width)
13356 return unicode_result_unchanged(self);
13357
13358 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013359
13360 u = pad(self, fill, 0, '0');
13361
Walter Dörwald068325e2002-04-15 13:36:47 +000013362 if (u == NULL)
13363 return NULL;
13364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365 kind = PyUnicode_KIND(u);
13366 data = PyUnicode_DATA(u);
13367 chr = PyUnicode_READ(kind, data, fill);
13368
13369 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013370 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 PyUnicode_WRITE(kind, data, 0, chr);
13372 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373 }
13374
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013375 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013376 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378
#if 0
/* Compiled out by the surrounding "#if 0".  Thin wrapper that forwards
   to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013387PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013390Return True if S starts with the specified prefix, False otherwise.\n\
13391With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013392With optional end, stop comparing S at that position.\n\
13393prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394
13395static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013396unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013399 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013400 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013401 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013402 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013403 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013404
Jesus Ceaac451502011-04-20 17:09:23 +020013405 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013406 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013407 if (PyTuple_Check(subobj)) {
13408 Py_ssize_t i;
13409 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013410 substring = PyTuple_GET_ITEM(subobj, i);
13411 if (!PyUnicode_Check(substring)) {
13412 PyErr_Format(PyExc_TypeError,
13413 "tuple for startswith must only contain str, "
13414 "not %.100s",
13415 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013416 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013417 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013418 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013419 if (result == -1)
13420 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013421 if (result) {
13422 Py_RETURN_TRUE;
13423 }
13424 }
13425 /* nothing matched */
13426 Py_RETURN_FALSE;
13427 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013428 if (!PyUnicode_Check(subobj)) {
13429 PyErr_Format(PyExc_TypeError,
13430 "startswith first arg must be str or "
13431 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013432 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013433 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013434 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013435 if (result == -1)
13436 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013437 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438}
13439
13440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013441PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013444Return True if S ends with the specified suffix, False otherwise.\n\
13445With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013446With optional end, stop comparing S at that position.\n\
13447suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448
13449static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013450unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013452{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013453 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013454 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013455 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013456 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013457 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013458
Jesus Ceaac451502011-04-20 17:09:23 +020013459 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013461 if (PyTuple_Check(subobj)) {
13462 Py_ssize_t i;
13463 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013464 substring = PyTuple_GET_ITEM(subobj, i);
13465 if (!PyUnicode_Check(substring)) {
13466 PyErr_Format(PyExc_TypeError,
13467 "tuple for endswith must only contain str, "
13468 "not %.100s",
13469 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013471 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013472 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013473 if (result == -1)
13474 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013475 if (result) {
13476 Py_RETURN_TRUE;
13477 }
13478 }
13479 Py_RETURN_FALSE;
13480 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013481 if (!PyUnicode_Check(subobj)) {
13482 PyErr_Format(PyExc_TypeError,
13483 "endswith first arg must be str or "
13484 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013485 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013486 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013487 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013488 if (result == -1)
13489 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013490 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013491}
13492
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013493static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013494_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013495{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013496 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13497 writer->data = PyUnicode_DATA(writer->buffer);
13498
13499 if (!writer->readonly) {
13500 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013501 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013502 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013503 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013504 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13505 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13506 writer->kind = PyUnicode_WCHAR_KIND;
13507 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13508
Victor Stinner8f674cc2013-04-17 23:02:17 +020013509 /* Copy-on-write mode: set buffer size to 0 so
13510 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13511 * next write. */
13512 writer->size = 0;
13513 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013514}
13515
Victor Stinnerd3f08822012-05-29 12:57:52 +020013516void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013517_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013518{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013520
13521 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013522 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013523
13524 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13525 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13526 writer->kind = PyUnicode_WCHAR_KIND;
13527 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013528}
13529
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530int
13531_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13532 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013533{
13534 Py_ssize_t newlen;
13535 PyObject *newbuffer;
13536
Victor Stinner2740e462016-09-06 16:58:36 -070013537 assert(maxchar <= MAX_UNICODE);
13538
Victor Stinnerca9381e2015-09-22 00:58:32 +020013539 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013540 assert((maxchar > writer->maxchar && length >= 0)
13541 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013542
Victor Stinner202fdca2012-05-07 12:47:02 +020013543 if (length > PY_SSIZE_T_MAX - writer->pos) {
13544 PyErr_NoMemory();
13545 return -1;
13546 }
13547 newlen = writer->pos + length;
13548
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013549 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013550
Victor Stinnerd3f08822012-05-29 12:57:52 +020013551 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013552 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013553 if (writer->overallocate
13554 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13555 /* overallocate to limit the number of realloc() */
13556 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013558 if (newlen < writer->min_length)
13559 newlen = writer->min_length;
13560
Victor Stinnerd3f08822012-05-29 12:57:52 +020013561 writer->buffer = PyUnicode_New(newlen, maxchar);
13562 if (writer->buffer == NULL)
13563 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013564 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013565 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013566 if (writer->overallocate
13567 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13568 /* overallocate to limit the number of realloc() */
13569 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013570 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013571 if (newlen < writer->min_length)
13572 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013573
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013574 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013575 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013576 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013577 newbuffer = PyUnicode_New(newlen, maxchar);
13578 if (newbuffer == NULL)
13579 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013580 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13581 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013582 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013583 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013584 }
13585 else {
13586 newbuffer = resize_compact(writer->buffer, newlen);
13587 if (newbuffer == NULL)
13588 return -1;
13589 }
13590 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013591 }
13592 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013593 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013594 newbuffer = PyUnicode_New(writer->size, maxchar);
13595 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013596 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013597 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13598 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013599 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013600 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013601 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013602 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013603
13604#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013605}
13606
Victor Stinnerca9381e2015-09-22 00:58:32 +020013607int
13608_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13609 enum PyUnicode_Kind kind)
13610{
13611 Py_UCS4 maxchar;
13612
13613 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13614 assert(writer->kind < kind);
13615
13616 switch (kind)
13617 {
13618 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13619 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13620 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13621 default:
13622 assert(0 && "invalid kind");
13623 return -1;
13624 }
13625
13626 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13627}
13628
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013629static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013630_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013631{
Victor Stinner2740e462016-09-06 16:58:36 -070013632 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013633 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13634 return -1;
13635 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13636 writer->pos++;
13637 return 0;
13638}
13639
13640int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013641_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13642{
13643 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13644}
13645
13646int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013647_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13648{
13649 Py_UCS4 maxchar;
13650 Py_ssize_t len;
13651
13652 if (PyUnicode_READY(str) == -1)
13653 return -1;
13654 len = PyUnicode_GET_LENGTH(str);
13655 if (len == 0)
13656 return 0;
13657 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13658 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013659 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013660 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013661 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013662 Py_INCREF(str);
13663 writer->buffer = str;
13664 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013665 writer->pos += len;
13666 return 0;
13667 }
13668 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13669 return -1;
13670 }
13671 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13672 str, 0, len);
13673 writer->pos += len;
13674 return 0;
13675}
13676
Victor Stinnere215d962012-10-06 23:03:36 +020013677int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013678_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13679 Py_ssize_t start, Py_ssize_t end)
13680{
13681 Py_UCS4 maxchar;
13682 Py_ssize_t len;
13683
13684 if (PyUnicode_READY(str) == -1)
13685 return -1;
13686
13687 assert(0 <= start);
13688 assert(end <= PyUnicode_GET_LENGTH(str));
13689 assert(start <= end);
13690
13691 if (end == 0)
13692 return 0;
13693
13694 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13695 return _PyUnicodeWriter_WriteStr(writer, str);
13696
13697 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13698 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13699 else
13700 maxchar = writer->maxchar;
13701 len = end - start;
13702
13703 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13704 return -1;
13705
13706 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13707 str, start, len);
13708 writer->pos += len;
13709 return 0;
13710}
13711
13712int
Victor Stinner4a587072013-11-19 12:54:53 +010013713_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13714 const char *ascii, Py_ssize_t len)
13715{
13716 if (len == -1)
13717 len = strlen(ascii);
13718
13719 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13720
13721 if (writer->buffer == NULL && !writer->overallocate) {
13722 PyObject *str;
13723
13724 str = _PyUnicode_FromASCII(ascii, len);
13725 if (str == NULL)
13726 return -1;
13727
13728 writer->readonly = 1;
13729 writer->buffer = str;
13730 _PyUnicodeWriter_Update(writer);
13731 writer->pos += len;
13732 return 0;
13733 }
13734
13735 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13736 return -1;
13737
13738 switch (writer->kind)
13739 {
13740 case PyUnicode_1BYTE_KIND:
13741 {
13742 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13743 Py_UCS1 *data = writer->data;
13744
Christian Heimesf051e432016-09-13 20:22:02 +020013745 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013746 break;
13747 }
13748 case PyUnicode_2BYTE_KIND:
13749 {
13750 _PyUnicode_CONVERT_BYTES(
13751 Py_UCS1, Py_UCS2,
13752 ascii, ascii + len,
13753 (Py_UCS2 *)writer->data + writer->pos);
13754 break;
13755 }
13756 case PyUnicode_4BYTE_KIND:
13757 {
13758 _PyUnicode_CONVERT_BYTES(
13759 Py_UCS1, Py_UCS4,
13760 ascii, ascii + len,
13761 (Py_UCS4 *)writer->data + writer->pos);
13762 break;
13763 }
13764 default:
13765 assert(0);
13766 }
13767
13768 writer->pos += len;
13769 return 0;
13770}
13771
13772int
13773_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13774 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013775{
13776 Py_UCS4 maxchar;
13777
13778 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13779 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13780 return -1;
13781 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13782 writer->pos += len;
13783 return 0;
13784}
13785
Victor Stinnerd3f08822012-05-29 12:57:52 +020013786PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013787_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013788{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013789 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013790
Victor Stinnerd3f08822012-05-29 12:57:52 +020013791 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013792 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013793 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013794 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013795
13796 str = writer->buffer;
13797 writer->buffer = NULL;
13798
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013799 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013800 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13801 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013802 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013803
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013804 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13805 PyObject *str2;
13806 str2 = resize_compact(str, writer->pos);
13807 if (str2 == NULL) {
13808 Py_DECREF(str);
13809 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013810 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013811 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013812 }
13813
Victor Stinner15a0bd32013-07-08 22:29:55 +020013814 assert(_PyUnicode_CheckConsistency(str, 1));
13815 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013816}
13817
Victor Stinnerd3f08822012-05-29 12:57:52 +020013818void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013819_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013820{
13821 Py_CLEAR(writer->buffer);
13822}
13823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013824#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013825
13826PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013827 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013828\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013829Return a formatted version of S, using substitutions from args and kwargs.\n\
13830The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013831
Eric Smith27bbca62010-11-04 17:06:58 +000013832PyDoc_STRVAR(format_map__doc__,
13833 "S.format_map(mapping) -> str\n\
13834\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013835Return a formatted version of S, using substitutions from mapping.\n\
13836The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013837
INADA Naoki3ae20562017-01-16 20:41:20 +090013838/*[clinic input]
13839str.__format__ as unicode___format__
13840
13841 format_spec: unicode
13842 /
13843
13844Return a formatted version of the string as described by format_spec.
13845[clinic start generated code]*/
13846
Eric Smith4a7d76d2008-05-30 18:10:19 +000013847static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013848unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013849/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013850{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013851 _PyUnicodeWriter writer;
13852 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013853
Victor Stinnerd3f08822012-05-29 12:57:52 +020013854 if (PyUnicode_READY(self) == -1)
13855 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013856 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013857 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13858 self, format_spec, 0,
13859 PyUnicode_GET_LENGTH(format_spec));
13860 if (ret == -1) {
13861 _PyUnicodeWriter_Dealloc(&writer);
13862 return NULL;
13863 }
13864 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013865}
13866
INADA Naoki3ae20562017-01-16 20:41:20 +090013867/*[clinic input]
13868str.__sizeof__ as unicode_sizeof
13869
13870Return the size of the string in memory, in bytes.
13871[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013872
13873static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013874unicode_sizeof_impl(PyObject *self)
13875/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013877 Py_ssize_t size;
13878
13879 /* If it's a compact object, account for base structure +
13880 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013881 if (PyUnicode_IS_COMPACT_ASCII(self))
13882 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13883 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013884 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013885 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013886 else {
13887 /* If it is a two-block object, account for base object, and
13888 for character block if present. */
13889 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013890 if (_PyUnicode_DATA_ANY(self))
13891 size += (PyUnicode_GET_LENGTH(self) + 1) *
13892 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013893 }
13894 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013895 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013896 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13897 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13898 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13899 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013900
13901 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013902}
13903
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013904static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013905unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013906{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013907 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013908 if (!copy)
13909 return NULL;
13910 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013911}
13912
Guido van Rossumd57fd912000-03-10 22:53:23 +000013913static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013914 UNICODE_ENCODE_METHODDEF
13915 UNICODE_REPLACE_METHODDEF
13916 UNICODE_SPLIT_METHODDEF
13917 UNICODE_RSPLIT_METHODDEF
13918 UNICODE_JOIN_METHODDEF
13919 UNICODE_CAPITALIZE_METHODDEF
13920 UNICODE_CASEFOLD_METHODDEF
13921 UNICODE_TITLE_METHODDEF
13922 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013923 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013924 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013925 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013926 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013927 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013928 UNICODE_LJUST_METHODDEF
13929 UNICODE_LOWER_METHODDEF
13930 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013931 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13932 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013933 UNICODE_RJUST_METHODDEF
13934 UNICODE_RSTRIP_METHODDEF
13935 UNICODE_RPARTITION_METHODDEF
13936 UNICODE_SPLITLINES_METHODDEF
13937 UNICODE_STRIP_METHODDEF
13938 UNICODE_SWAPCASE_METHODDEF
13939 UNICODE_TRANSLATE_METHODDEF
13940 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013941 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13942 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013943 UNICODE_ISLOWER_METHODDEF
13944 UNICODE_ISUPPER_METHODDEF
13945 UNICODE_ISTITLE_METHODDEF
13946 UNICODE_ISSPACE_METHODDEF
13947 UNICODE_ISDECIMAL_METHODDEF
13948 UNICODE_ISDIGIT_METHODDEF
13949 UNICODE_ISNUMERIC_METHODDEF
13950 UNICODE_ISALPHA_METHODDEF
13951 UNICODE_ISALNUM_METHODDEF
13952 UNICODE_ISIDENTIFIER_METHODDEF
13953 UNICODE_ISPRINTABLE_METHODDEF
13954 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013955 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013956 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013957 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013958 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013959 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013960#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013961 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013962 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013963#endif
13964
Benjamin Peterson14339b62009-01-31 16:36:08 +000013965 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013966 {NULL, NULL}
13967};
13968
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013969static PyObject *
13970unicode_mod(PyObject *v, PyObject *w)
13971{
Brian Curtindfc80e32011-08-10 20:28:54 -050013972 if (!PyUnicode_Check(v))
13973 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013974 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013975}
13976
13977static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 0, /*nb_add*/
13979 0, /*nb_subtract*/
13980 0, /*nb_multiply*/
13981 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013982};
13983
Guido van Rossumd57fd912000-03-10 22:53:23 +000013984static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 (lenfunc) unicode_length, /* sq_length */
13986 PyUnicode_Concat, /* sq_concat */
13987 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13988 (ssizeargfunc) unicode_getitem, /* sq_item */
13989 0, /* sq_slice */
13990 0, /* sq_ass_item */
13991 0, /* sq_ass_slice */
13992 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013993};
13994
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013995static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013996unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013998 if (PyUnicode_READY(self) == -1)
13999 return NULL;
14000
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014001 if (PyIndex_Check(item)) {
14002 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014003 if (i == -1 && PyErr_Occurred())
14004 return NULL;
14005 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014006 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014007 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014008 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000014009 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014010 PyObject *result;
14011 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014012 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014013 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014014
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014015 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014016 return NULL;
14017 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014018 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14019 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014020
14021 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014022 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014023 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014024 slicelength == PyUnicode_GET_LENGTH(self)) {
14025 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014026 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014027 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014028 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014029 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014030 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014031 src_kind = PyUnicode_KIND(self);
14032 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014033 if (!PyUnicode_IS_ASCII(self)) {
14034 kind_limit = kind_maxchar_limit(src_kind);
14035 max_char = 0;
14036 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14037 ch = PyUnicode_READ(src_kind, src_data, cur);
14038 if (ch > max_char) {
14039 max_char = ch;
14040 if (max_char >= kind_limit)
14041 break;
14042 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014043 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014044 }
Victor Stinner55c99112011-10-13 01:17:06 +020014045 else
14046 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014047 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014048 if (result == NULL)
14049 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014050 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014051 dest_data = PyUnicode_DATA(result);
14052
14053 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014054 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14055 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014056 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014057 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014058 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014059 } else {
14060 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14061 return NULL;
14062 }
14063}
14064
14065static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014066 (lenfunc)unicode_length, /* mp_length */
14067 (binaryfunc)unicode_subscript, /* mp_subscript */
14068 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014069};
14070
Guido van Rossumd57fd912000-03-10 22:53:23 +000014071
Guido van Rossumd57fd912000-03-10 22:53:23 +000014072/* Helpers for PyUnicode_Format() */
14073
Victor Stinnera47082312012-10-04 02:19:54 +020014074struct unicode_formatter_t {
14075 PyObject *args;
14076 int args_owned;
14077 Py_ssize_t arglen, argidx;
14078 PyObject *dict;
14079
14080 enum PyUnicode_Kind fmtkind;
14081 Py_ssize_t fmtcnt, fmtpos;
14082 void *fmtdata;
14083 PyObject *fmtstr;
14084
14085 _PyUnicodeWriter writer;
14086};
14087
14088struct unicode_format_arg_t {
14089 Py_UCS4 ch;
14090 int flags;
14091 Py_ssize_t width;
14092 int prec;
14093 int sign;
14094};
14095
Guido van Rossumd57fd912000-03-10 22:53:23 +000014096static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014097unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014098{
Victor Stinnera47082312012-10-04 02:19:54 +020014099 Py_ssize_t argidx = ctx->argidx;
14100
14101 if (argidx < ctx->arglen) {
14102 ctx->argidx++;
14103 if (ctx->arglen < 0)
14104 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014105 else
Victor Stinnera47082312012-10-04 02:19:54 +020014106 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014107 }
14108 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014109 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014110 return NULL;
14111}
14112
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014113/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014114
Victor Stinnera47082312012-10-04 02:19:54 +020014115/* Format a float into the writer if the writer is not NULL, or into *p_output
14116 otherwise.
14117
14118 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014119static int
Victor Stinnera47082312012-10-04 02:19:54 +020014120formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14121 PyObject **p_output,
14122 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014123{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014124 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014125 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014126 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014127 int prec;
14128 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014129
Guido van Rossumd57fd912000-03-10 22:53:23 +000014130 x = PyFloat_AsDouble(v);
14131 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014132 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014133
Victor Stinnera47082312012-10-04 02:19:54 +020014134 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014135 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014136 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014137
Victor Stinnera47082312012-10-04 02:19:54 +020014138 if (arg->flags & F_ALT)
14139 dtoa_flags = Py_DTSF_ALT;
14140 else
14141 dtoa_flags = 0;
14142 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014143 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014144 return -1;
14145 len = strlen(p);
14146 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014147 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014148 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014149 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014150 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014151 }
14152 else
14153 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014154 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014155 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014156}
14157
Victor Stinnerd0880d52012-04-27 23:40:13 +020014158/* formatlong() emulates the format codes d, u, o, x and X, and
14159 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14160 * Python's regular ints.
14161 * Return value: a new PyUnicodeObject*, or NULL if error.
14162 * The output string is of the form
14163 * "-"? ("0x" | "0X")? digit+
14164 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14165 * set in flags. The case of hex digits will be correct,
14166 * There will be at least prec digits, zero-filled on the left if
14167 * necessary to get that many.
14168 * val object to be converted
14169 * flags bitmask of format flags; only F_ALT is looked at
14170 * prec minimum number of digits; 0-fill on left if needed
14171 * type a character in [duoxX]; u acts the same as d
14172 *
14173 * CAUTION: o, x and X conversions on regular ints can never
14174 * produce a '-' sign, but can for Python's unbounded ints.
14175 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014176PyObject *
14177_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014178{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014179 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014181 Py_ssize_t i;
14182 int sign; /* 1 if '-', else 0 */
14183 int len; /* number of characters */
14184 Py_ssize_t llen;
14185 int numdigits; /* len == numnondigits + numdigits */
14186 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014187
Victor Stinnerd0880d52012-04-27 23:40:13 +020014188 /* Avoid exceeding SSIZE_T_MAX */
14189 if (prec > INT_MAX-3) {
14190 PyErr_SetString(PyExc_OverflowError,
14191 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014192 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014193 }
14194
14195 assert(PyLong_Check(val));
14196
14197 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014198 default:
14199 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014200 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014201 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014202 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014203 /* int and int subclasses should print numerically when a numeric */
14204 /* format code is used (see issue18780) */
14205 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014206 break;
14207 case 'o':
14208 numnondigits = 2;
14209 result = PyNumber_ToBase(val, 8);
14210 break;
14211 case 'x':
14212 case 'X':
14213 numnondigits = 2;
14214 result = PyNumber_ToBase(val, 16);
14215 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014216 }
14217 if (!result)
14218 return NULL;
14219
14220 assert(unicode_modifiable(result));
14221 assert(PyUnicode_IS_READY(result));
14222 assert(PyUnicode_IS_ASCII(result));
14223
14224 /* To modify the string in-place, there can only be one reference. */
14225 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014226 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014227 PyErr_BadInternalCall();
14228 return NULL;
14229 }
14230 buf = PyUnicode_DATA(result);
14231 llen = PyUnicode_GET_LENGTH(result);
14232 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014233 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014234 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014235 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014236 return NULL;
14237 }
14238 len = (int)llen;
14239 sign = buf[0] == '-';
14240 numnondigits += sign;
14241 numdigits = len - numnondigits;
14242 assert(numdigits > 0);
14243
14244 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014245 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014246 (type == 'o' || type == 'x' || type == 'X'))) {
14247 assert(buf[sign] == '0');
14248 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14249 buf[sign+1] == 'o');
14250 numnondigits -= 2;
14251 buf += 2;
14252 len -= 2;
14253 if (sign)
14254 buf[0] = '-';
14255 assert(len == numnondigits + numdigits);
14256 assert(numdigits > 0);
14257 }
14258
14259 /* Fill with leading zeroes to meet minimum width. */
14260 if (prec > numdigits) {
14261 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14262 numnondigits + prec);
14263 char *b1;
14264 if (!r1) {
14265 Py_DECREF(result);
14266 return NULL;
14267 }
14268 b1 = PyBytes_AS_STRING(r1);
14269 for (i = 0; i < numnondigits; ++i)
14270 *b1++ = *buf++;
14271 for (i = 0; i < prec - numdigits; i++)
14272 *b1++ = '0';
14273 for (i = 0; i < numdigits; i++)
14274 *b1++ = *buf++;
14275 *b1 = '\0';
14276 Py_DECREF(result);
14277 result = r1;
14278 buf = PyBytes_AS_STRING(result);
14279 len = numnondigits + prec;
14280 }
14281
14282 /* Fix up case for hex conversions. */
14283 if (type == 'X') {
14284 /* Need to convert all lower case letters to upper case.
14285 and need to convert 0x to 0X (and -0x to -0X). */
14286 for (i = 0; i < len; i++)
14287 if (buf[i] >= 'a' && buf[i] <= 'x')
14288 buf[i] -= 'a'-'A';
14289 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014290 if (!PyUnicode_Check(result)
14291 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014292 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014293 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014294 Py_DECREF(result);
14295 result = unicode;
14296 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014297 else if (len != PyUnicode_GET_LENGTH(result)) {
14298 if (PyUnicode_Resize(&result, len) < 0)
14299 Py_CLEAR(result);
14300 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014301 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014302}
14303
Ethan Furmandf3ed242014-01-05 06:50:30 -080014304/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014305 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014306 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014307 * -1 and raise an exception on error */
14308static int
Victor Stinnera47082312012-10-04 02:19:54 +020014309mainformatlong(PyObject *v,
14310 struct unicode_format_arg_t *arg,
14311 PyObject **p_output,
14312 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014313{
14314 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014315 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014316
14317 if (!PyNumber_Check(v))
14318 goto wrongtype;
14319
Ethan Furman9ab74802014-03-21 06:38:46 -070014320 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014321 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014322 if (type == 'o' || type == 'x' || type == 'X') {
14323 iobj = PyNumber_Index(v);
14324 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014325 if (PyErr_ExceptionMatches(PyExc_TypeError))
14326 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014327 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014328 }
14329 }
14330 else {
14331 iobj = PyNumber_Long(v);
14332 if (iobj == NULL ) {
14333 if (PyErr_ExceptionMatches(PyExc_TypeError))
14334 goto wrongtype;
14335 return -1;
14336 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014337 }
14338 assert(PyLong_Check(iobj));
14339 }
14340 else {
14341 iobj = v;
14342 Py_INCREF(iobj);
14343 }
14344
14345 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014346 && arg->width == -1 && arg->prec == -1
14347 && !(arg->flags & (F_SIGN | F_BLANK))
14348 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014349 {
14350 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014351 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014352 int base;
14353
Victor Stinnera47082312012-10-04 02:19:54 +020014354 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014355 {
14356 default:
14357 assert(0 && "'type' not in [diuoxX]");
14358 case 'd':
14359 case 'i':
14360 case 'u':
14361 base = 10;
14362 break;
14363 case 'o':
14364 base = 8;
14365 break;
14366 case 'x':
14367 case 'X':
14368 base = 16;
14369 break;
14370 }
14371
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014372 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14373 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014374 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014375 }
14376 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014377 return 1;
14378 }
14379
Ethan Furmanb95b5612015-01-23 20:05:18 -080014380 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014381 Py_DECREF(iobj);
14382 if (res == NULL)
14383 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014384 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014385 return 0;
14386
14387wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014388 switch(type)
14389 {
14390 case 'o':
14391 case 'x':
14392 case 'X':
14393 PyErr_Format(PyExc_TypeError,
14394 "%%%c format: an integer is required, "
14395 "not %.200s",
14396 type, Py_TYPE(v)->tp_name);
14397 break;
14398 default:
14399 PyErr_Format(PyExc_TypeError,
14400 "%%%c format: a number is required, "
14401 "not %.200s",
14402 type, Py_TYPE(v)->tp_name);
14403 break;
14404 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014405 return -1;
14406}
14407
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014408static Py_UCS4
14409formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014410{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014411 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014412 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014413 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014414 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014415 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014416 goto onError;
14417 }
14418 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014419 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014420 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014421 /* make sure number is a type of integer */
14422 if (!PyLong_Check(v)) {
14423 iobj = PyNumber_Index(v);
14424 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014425 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014426 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014427 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014428 Py_DECREF(iobj);
14429 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014430 else {
14431 x = PyLong_AsLong(v);
14432 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014433 if (x == -1 && PyErr_Occurred())
14434 goto onError;
14435
Victor Stinner8faf8212011-12-08 22:14:11 +010014436 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014437 PyErr_SetString(PyExc_OverflowError,
14438 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014439 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014440 }
14441
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014442 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014443 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014444
Benjamin Peterson29060642009-01-31 22:14:21 +000014445 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014446 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014447 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014448 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014449}
14450
Victor Stinnera47082312012-10-04 02:19:54 +020014451/* Parse options of an argument: flags, width, precision.
14452 Handle also "%(name)" syntax.
14453
14454 Return 0 if the argument has been formatted into arg->str.
14455 Return 1 if the argument has been written into ctx->writer,
14456 Raise an exception and return -1 on error. */
14457static int
14458unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14459 struct unicode_format_arg_t *arg)
14460{
14461#define FORMAT_READ(ctx) \
14462 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14463
14464 PyObject *v;
14465
Victor Stinnera47082312012-10-04 02:19:54 +020014466 if (arg->ch == '(') {
14467 /* Get argument value from a dictionary. Example: "%(name)s". */
14468 Py_ssize_t keystart;
14469 Py_ssize_t keylen;
14470 PyObject *key;
14471 int pcount = 1;
14472
14473 if (ctx->dict == NULL) {
14474 PyErr_SetString(PyExc_TypeError,
14475 "format requires a mapping");
14476 return -1;
14477 }
14478 ++ctx->fmtpos;
14479 --ctx->fmtcnt;
14480 keystart = ctx->fmtpos;
14481 /* Skip over balanced parentheses */
14482 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14483 arg->ch = FORMAT_READ(ctx);
14484 if (arg->ch == ')')
14485 --pcount;
14486 else if (arg->ch == '(')
14487 ++pcount;
14488 ctx->fmtpos++;
14489 }
14490 keylen = ctx->fmtpos - keystart - 1;
14491 if (ctx->fmtcnt < 0 || pcount > 0) {
14492 PyErr_SetString(PyExc_ValueError,
14493 "incomplete format key");
14494 return -1;
14495 }
14496 key = PyUnicode_Substring(ctx->fmtstr,
14497 keystart, keystart + keylen);
14498 if (key == NULL)
14499 return -1;
14500 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014501 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014502 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014503 }
14504 ctx->args = PyObject_GetItem(ctx->dict, key);
14505 Py_DECREF(key);
14506 if (ctx->args == NULL)
14507 return -1;
14508 ctx->args_owned = 1;
14509 ctx->arglen = -1;
14510 ctx->argidx = -2;
14511 }
14512
14513 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014514 while (--ctx->fmtcnt >= 0) {
14515 arg->ch = FORMAT_READ(ctx);
14516 ctx->fmtpos++;
14517 switch (arg->ch) {
14518 case '-': arg->flags |= F_LJUST; continue;
14519 case '+': arg->flags |= F_SIGN; continue;
14520 case ' ': arg->flags |= F_BLANK; continue;
14521 case '#': arg->flags |= F_ALT; continue;
14522 case '0': arg->flags |= F_ZERO; continue;
14523 }
14524 break;
14525 }
14526
14527 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014528 if (arg->ch == '*') {
14529 v = unicode_format_getnextarg(ctx);
14530 if (v == NULL)
14531 return -1;
14532 if (!PyLong_Check(v)) {
14533 PyErr_SetString(PyExc_TypeError,
14534 "* wants int");
14535 return -1;
14536 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014537 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014538 if (arg->width == -1 && PyErr_Occurred())
14539 return -1;
14540 if (arg->width < 0) {
14541 arg->flags |= F_LJUST;
14542 arg->width = -arg->width;
14543 }
14544 if (--ctx->fmtcnt >= 0) {
14545 arg->ch = FORMAT_READ(ctx);
14546 ctx->fmtpos++;
14547 }
14548 }
14549 else if (arg->ch >= '0' && arg->ch <= '9') {
14550 arg->width = arg->ch - '0';
14551 while (--ctx->fmtcnt >= 0) {
14552 arg->ch = FORMAT_READ(ctx);
14553 ctx->fmtpos++;
14554 if (arg->ch < '0' || arg->ch > '9')
14555 break;
14556 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14557 mixing signed and unsigned comparison. Since arg->ch is between
14558 '0' and '9', casting to int is safe. */
14559 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14560 PyErr_SetString(PyExc_ValueError,
14561 "width too big");
14562 return -1;
14563 }
14564 arg->width = arg->width*10 + (arg->ch - '0');
14565 }
14566 }
14567
14568 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014569 if (arg->ch == '.') {
14570 arg->prec = 0;
14571 if (--ctx->fmtcnt >= 0) {
14572 arg->ch = FORMAT_READ(ctx);
14573 ctx->fmtpos++;
14574 }
14575 if (arg->ch == '*') {
14576 v = unicode_format_getnextarg(ctx);
14577 if (v == NULL)
14578 return -1;
14579 if (!PyLong_Check(v)) {
14580 PyErr_SetString(PyExc_TypeError,
14581 "* wants int");
14582 return -1;
14583 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014584 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014585 if (arg->prec == -1 && PyErr_Occurred())
14586 return -1;
14587 if (arg->prec < 0)
14588 arg->prec = 0;
14589 if (--ctx->fmtcnt >= 0) {
14590 arg->ch = FORMAT_READ(ctx);
14591 ctx->fmtpos++;
14592 }
14593 }
14594 else if (arg->ch >= '0' && arg->ch <= '9') {
14595 arg->prec = arg->ch - '0';
14596 while (--ctx->fmtcnt >= 0) {
14597 arg->ch = FORMAT_READ(ctx);
14598 ctx->fmtpos++;
14599 if (arg->ch < '0' || arg->ch > '9')
14600 break;
14601 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14602 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014603 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014604 return -1;
14605 }
14606 arg->prec = arg->prec*10 + (arg->ch - '0');
14607 }
14608 }
14609 }
14610
14611 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14612 if (ctx->fmtcnt >= 0) {
14613 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14614 if (--ctx->fmtcnt >= 0) {
14615 arg->ch = FORMAT_READ(ctx);
14616 ctx->fmtpos++;
14617 }
14618 }
14619 }
14620 if (ctx->fmtcnt < 0) {
14621 PyErr_SetString(PyExc_ValueError,
14622 "incomplete format");
14623 return -1;
14624 }
14625 return 0;
14626
14627#undef FORMAT_READ
14628}
14629
14630/* Format one argument. Supported conversion specifiers:
14631
14632 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014633 - "i", "d", "u": int or float
14634 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014635 - "e", "E", "f", "F", "g", "G": float
14636 - "c": int or str (1 character)
14637
Victor Stinner8dbd4212012-12-04 09:30:24 +010014638 When possible, the output is written directly into the Unicode writer
14639 (ctx->writer). A string is created when padding is required.
14640
Victor Stinnera47082312012-10-04 02:19:54 +020014641 Return 0 if the argument has been formatted into *p_str,
14642 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014643 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014644static int
14645unicode_format_arg_format(struct unicode_formatter_t *ctx,
14646 struct unicode_format_arg_t *arg,
14647 PyObject **p_str)
14648{
14649 PyObject *v;
14650 _PyUnicodeWriter *writer = &ctx->writer;
14651
14652 if (ctx->fmtcnt == 0)
14653 ctx->writer.overallocate = 0;
14654
Victor Stinnera47082312012-10-04 02:19:54 +020014655 v = unicode_format_getnextarg(ctx);
14656 if (v == NULL)
14657 return -1;
14658
Victor Stinnera47082312012-10-04 02:19:54 +020014659
14660 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014661 case 's':
14662 case 'r':
14663 case 'a':
14664 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14665 /* Fast path */
14666 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14667 return -1;
14668 return 1;
14669 }
14670
14671 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14672 *p_str = v;
14673 Py_INCREF(*p_str);
14674 }
14675 else {
14676 if (arg->ch == 's')
14677 *p_str = PyObject_Str(v);
14678 else if (arg->ch == 'r')
14679 *p_str = PyObject_Repr(v);
14680 else
14681 *p_str = PyObject_ASCII(v);
14682 }
14683 break;
14684
14685 case 'i':
14686 case 'd':
14687 case 'u':
14688 case 'o':
14689 case 'x':
14690 case 'X':
14691 {
14692 int ret = mainformatlong(v, arg, p_str, writer);
14693 if (ret != 0)
14694 return ret;
14695 arg->sign = 1;
14696 break;
14697 }
14698
14699 case 'e':
14700 case 'E':
14701 case 'f':
14702 case 'F':
14703 case 'g':
14704 case 'G':
14705 if (arg->width == -1 && arg->prec == -1
14706 && !(arg->flags & (F_SIGN | F_BLANK)))
14707 {
14708 /* Fast path */
14709 if (formatfloat(v, arg, NULL, writer) == -1)
14710 return -1;
14711 return 1;
14712 }
14713
14714 arg->sign = 1;
14715 if (formatfloat(v, arg, p_str, NULL) == -1)
14716 return -1;
14717 break;
14718
14719 case 'c':
14720 {
14721 Py_UCS4 ch = formatchar(v);
14722 if (ch == (Py_UCS4) -1)
14723 return -1;
14724 if (arg->width == -1 && arg->prec == -1) {
14725 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014726 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014727 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014728 return 1;
14729 }
14730 *p_str = PyUnicode_FromOrdinal(ch);
14731 break;
14732 }
14733
14734 default:
14735 PyErr_Format(PyExc_ValueError,
14736 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014737 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014738 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14739 (int)arg->ch,
14740 ctx->fmtpos - 1);
14741 return -1;
14742 }
14743 if (*p_str == NULL)
14744 return -1;
14745 assert (PyUnicode_Check(*p_str));
14746 return 0;
14747}
14748
14749static int
14750unicode_format_arg_output(struct unicode_formatter_t *ctx,
14751 struct unicode_format_arg_t *arg,
14752 PyObject *str)
14753{
14754 Py_ssize_t len;
14755 enum PyUnicode_Kind kind;
14756 void *pbuf;
14757 Py_ssize_t pindex;
14758 Py_UCS4 signchar;
14759 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014760 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014761 Py_ssize_t sublen;
14762 _PyUnicodeWriter *writer = &ctx->writer;
14763 Py_UCS4 fill;
14764
14765 fill = ' ';
14766 if (arg->sign && arg->flags & F_ZERO)
14767 fill = '0';
14768
14769 if (PyUnicode_READY(str) == -1)
14770 return -1;
14771
14772 len = PyUnicode_GET_LENGTH(str);
14773 if ((arg->width == -1 || arg->width <= len)
14774 && (arg->prec == -1 || arg->prec >= len)
14775 && !(arg->flags & (F_SIGN | F_BLANK)))
14776 {
14777 /* Fast path */
14778 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14779 return -1;
14780 return 0;
14781 }
14782
14783 /* Truncate the string for "s", "r" and "a" formats
14784 if the precision is set */
14785 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14786 if (arg->prec >= 0 && len > arg->prec)
14787 len = arg->prec;
14788 }
14789
14790 /* Adjust sign and width */
14791 kind = PyUnicode_KIND(str);
14792 pbuf = PyUnicode_DATA(str);
14793 pindex = 0;
14794 signchar = '\0';
14795 if (arg->sign) {
14796 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14797 if (ch == '-' || ch == '+') {
14798 signchar = ch;
14799 len--;
14800 pindex++;
14801 }
14802 else if (arg->flags & F_SIGN)
14803 signchar = '+';
14804 else if (arg->flags & F_BLANK)
14805 signchar = ' ';
14806 else
14807 arg->sign = 0;
14808 }
14809 if (arg->width < len)
14810 arg->width = len;
14811
14812 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014813 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014814 if (!(arg->flags & F_LJUST)) {
14815 if (arg->sign) {
14816 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014817 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014818 }
14819 else {
14820 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014821 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014822 }
14823 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014824 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14825 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014826 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014827 }
14828
Victor Stinnera47082312012-10-04 02:19:54 +020014829 buflen = arg->width;
14830 if (arg->sign && len == arg->width)
14831 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014832 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014833 return -1;
14834
14835 /* Write the sign if needed */
14836 if (arg->sign) {
14837 if (fill != ' ') {
14838 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14839 writer->pos += 1;
14840 }
14841 if (arg->width > len)
14842 arg->width--;
14843 }
14844
14845 /* Write the numeric prefix for "x", "X" and "o" formats
14846 if the alternate form is used.
14847 For example, write "0x" for the "%#x" format. */
14848 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14849 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14850 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14851 if (fill != ' ') {
14852 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14853 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14854 writer->pos += 2;
14855 pindex += 2;
14856 }
14857 arg->width -= 2;
14858 if (arg->width < 0)
14859 arg->width = 0;
14860 len -= 2;
14861 }
14862
14863 /* Pad left with the fill character if needed */
14864 if (arg->width > len && !(arg->flags & F_LJUST)) {
14865 sublen = arg->width - len;
14866 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14867 writer->pos += sublen;
14868 arg->width = len;
14869 }
14870
14871 /* If padding with spaces: write sign if needed and/or numeric prefix if
14872 the alternate form is used */
14873 if (fill == ' ') {
14874 if (arg->sign) {
14875 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14876 writer->pos += 1;
14877 }
14878 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14879 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14880 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14881 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14882 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14883 writer->pos += 2;
14884 pindex += 2;
14885 }
14886 }
14887
14888 /* Write characters */
14889 if (len) {
14890 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14891 str, pindex, len);
14892 writer->pos += len;
14893 }
14894
14895 /* Pad right with the fill character if needed */
14896 if (arg->width > len) {
14897 sublen = arg->width - len;
14898 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14899 writer->pos += sublen;
14900 }
14901 return 0;
14902}
14903
14904/* Helper of PyUnicode_Format(): format one arg.
14905 Return 0 on success, raise an exception and return -1 on error. */
14906static int
14907unicode_format_arg(struct unicode_formatter_t *ctx)
14908{
14909 struct unicode_format_arg_t arg;
14910 PyObject *str;
14911 int ret;
14912
Victor Stinner8dbd4212012-12-04 09:30:24 +010014913 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014914 if (arg.ch == '%') {
14915 ctx->fmtpos++;
14916 ctx->fmtcnt--;
14917 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14918 return -1;
14919 return 0;
14920 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014921 arg.flags = 0;
14922 arg.width = -1;
14923 arg.prec = -1;
14924 arg.sign = 0;
14925 str = NULL;
14926
Victor Stinnera47082312012-10-04 02:19:54 +020014927 ret = unicode_format_arg_parse(ctx, &arg);
14928 if (ret == -1)
14929 return -1;
14930
14931 ret = unicode_format_arg_format(ctx, &arg, &str);
14932 if (ret == -1)
14933 return -1;
14934
14935 if (ret != 1) {
14936 ret = unicode_format_arg_output(ctx, &arg, str);
14937 Py_DECREF(str);
14938 if (ret == -1)
14939 return -1;
14940 }
14941
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014942 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014943 PyErr_SetString(PyExc_TypeError,
14944 "not all arguments converted during string formatting");
14945 return -1;
14946 }
14947 return 0;
14948}
14949
Alexander Belopolsky40018472011-02-26 01:02:56 +000014950PyObject *
14951PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014952{
Victor Stinnera47082312012-10-04 02:19:54 +020014953 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014954
Guido van Rossumd57fd912000-03-10 22:53:23 +000014955 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014956 PyErr_BadInternalCall();
14957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014958 }
Victor Stinnera47082312012-10-04 02:19:54 +020014959
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014960 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014961 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014962
14963 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014964 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14965 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14966 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14967 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014968
Victor Stinner8f674cc2013-04-17 23:02:17 +020014969 _PyUnicodeWriter_Init(&ctx.writer);
14970 ctx.writer.min_length = ctx.fmtcnt + 100;
14971 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014972
Guido van Rossumd57fd912000-03-10 22:53:23 +000014973 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014974 ctx.arglen = PyTuple_Size(args);
14975 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014976 }
14977 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014978 ctx.arglen = -1;
14979 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014980 }
Victor Stinnera47082312012-10-04 02:19:54 +020014981 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014982 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014983 ctx.dict = args;
14984 else
14985 ctx.dict = NULL;
14986 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014987
Victor Stinnera47082312012-10-04 02:19:54 +020014988 while (--ctx.fmtcnt >= 0) {
14989 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014990 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014991
14992 nonfmtpos = ctx.fmtpos++;
14993 while (ctx.fmtcnt >= 0 &&
14994 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14995 ctx.fmtpos++;
14996 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014997 }
Victor Stinnera47082312012-10-04 02:19:54 +020014998 if (ctx.fmtcnt < 0) {
14999 ctx.fmtpos--;
15000 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015001 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015002
Victor Stinnercfc4c132013-04-03 01:48:39 +020015003 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15004 nonfmtpos, ctx.fmtpos) < 0)
15005 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015006 }
15007 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015008 ctx.fmtpos++;
15009 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015010 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015011 }
15012 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015013
Victor Stinnera47082312012-10-04 02:19:54 +020015014 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015015 PyErr_SetString(PyExc_TypeError,
15016 "not all arguments converted during string formatting");
15017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015018 }
15019
Victor Stinnera47082312012-10-04 02:19:54 +020015020 if (ctx.args_owned) {
15021 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015022 }
Victor Stinnera47082312012-10-04 02:19:54 +020015023 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015024
Benjamin Peterson29060642009-01-31 22:14:21 +000015025 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015026 _PyUnicodeWriter_Dealloc(&ctx.writer);
15027 if (ctx.args_owned) {
15028 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015029 }
15030 return NULL;
15031}
15032
Jeremy Hylton938ace62002-07-17 16:30:39 +000015033static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015034unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15035
Tim Peters6d6c1a32001-08-02 04:15:00 +000015036static PyObject *
15037unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15038{
Benjamin Peterson29060642009-01-31 22:14:21 +000015039 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015040 static char *kwlist[] = {"object", "encoding", "errors", 0};
15041 char *encoding = NULL;
15042 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015043
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 if (type != &PyUnicode_Type)
15045 return unicode_subtype_new(type, args, kwds);
15046 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015047 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015048 return NULL;
15049 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015050 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015051 if (encoding == NULL && errors == NULL)
15052 return PyObject_Str(x);
15053 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015054 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015055}
15056
Guido van Rossume023fe02001-08-30 03:12:59 +000015057static PyObject *
15058unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15059{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015060 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015061 Py_ssize_t length, char_size;
15062 int share_wstr, share_utf8;
15063 unsigned int kind;
15064 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015065
Benjamin Peterson14339b62009-01-31 16:36:08 +000015066 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015067
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015068 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015069 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015070 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015071 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015072 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015073 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015074 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015075 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015076
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015077 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015078 if (self == NULL) {
15079 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 return NULL;
15081 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015082 kind = PyUnicode_KIND(unicode);
15083 length = PyUnicode_GET_LENGTH(unicode);
15084
15085 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015086#ifdef Py_DEBUG
15087 _PyUnicode_HASH(self) = -1;
15088#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015089 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015090#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015091 _PyUnicode_STATE(self).interned = 0;
15092 _PyUnicode_STATE(self).kind = kind;
15093 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015094 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015095 _PyUnicode_STATE(self).ready = 1;
15096 _PyUnicode_WSTR(self) = NULL;
15097 _PyUnicode_UTF8_LENGTH(self) = 0;
15098 _PyUnicode_UTF8(self) = NULL;
15099 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015100 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015101
15102 share_utf8 = 0;
15103 share_wstr = 0;
15104 if (kind == PyUnicode_1BYTE_KIND) {
15105 char_size = 1;
15106 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15107 share_utf8 = 1;
15108 }
15109 else if (kind == PyUnicode_2BYTE_KIND) {
15110 char_size = 2;
15111 if (sizeof(wchar_t) == 2)
15112 share_wstr = 1;
15113 }
15114 else {
15115 assert(kind == PyUnicode_4BYTE_KIND);
15116 char_size = 4;
15117 if (sizeof(wchar_t) == 4)
15118 share_wstr = 1;
15119 }
15120
15121 /* Ensure we won't overflow the length. */
15122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15123 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015124 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015126 data = PyObject_MALLOC((length + 1) * char_size);
15127 if (data == NULL) {
15128 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015129 goto onError;
15130 }
15131
Victor Stinnerc3c74152011-10-02 20:39:55 +020015132 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015133 if (share_utf8) {
15134 _PyUnicode_UTF8_LENGTH(self) = length;
15135 _PyUnicode_UTF8(self) = data;
15136 }
15137 if (share_wstr) {
15138 _PyUnicode_WSTR_LENGTH(self) = length;
15139 _PyUnicode_WSTR(self) = (wchar_t *)data;
15140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015141
Christian Heimesf051e432016-09-13 20:22:02 +020015142 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015143 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015144 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015145#ifdef Py_DEBUG
15146 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15147#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015148 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015149 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015150
15151onError:
15152 Py_DECREF(unicode);
15153 Py_DECREF(self);
15154 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015155}
15156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015157PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015158"str(object='') -> str\n\
15159str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015160\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015161Create a new string object from the given object. If encoding or\n\
15162errors is specified, then the object must expose a data buffer\n\
15163that will be decoded using the given encoding and error handler.\n\
15164Otherwise, returns the result of object.__str__() (if defined)\n\
15165or repr(object).\n\
15166encoding defaults to sys.getdefaultencoding().\n\
15167errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015168
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015169static PyObject *unicode_iter(PyObject *seq);
15170
Guido van Rossumd57fd912000-03-10 22:53:23 +000015171PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015172 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 "str", /* tp_name */
15174 sizeof(PyUnicodeObject), /* tp_size */
15175 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015176 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015177 (destructor)unicode_dealloc, /* tp_dealloc */
15178 0, /* tp_print */
15179 0, /* tp_getattr */
15180 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015181 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015182 unicode_repr, /* tp_repr */
15183 &unicode_as_number, /* tp_as_number */
15184 &unicode_as_sequence, /* tp_as_sequence */
15185 &unicode_as_mapping, /* tp_as_mapping */
15186 (hashfunc) unicode_hash, /* tp_hash*/
15187 0, /* tp_call*/
15188 (reprfunc) unicode_str, /* tp_str */
15189 PyObject_GenericGetAttr, /* tp_getattro */
15190 0, /* tp_setattro */
15191 0, /* tp_as_buffer */
15192 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015193 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015194 unicode_doc, /* tp_doc */
15195 0, /* tp_traverse */
15196 0, /* tp_clear */
15197 PyUnicode_RichCompare, /* tp_richcompare */
15198 0, /* tp_weaklistoffset */
15199 unicode_iter, /* tp_iter */
15200 0, /* tp_iternext */
15201 unicode_methods, /* tp_methods */
15202 0, /* tp_members */
15203 0, /* tp_getset */
15204 &PyBaseObject_Type, /* tp_base */
15205 0, /* tp_dict */
15206 0, /* tp_descr_get */
15207 0, /* tp_descr_set */
15208 0, /* tp_dictoffset */
15209 0, /* tp_init */
15210 0, /* tp_alloc */
15211 unicode_new, /* tp_new */
15212 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015213};
15214
15215/* Initialize the Unicode implementation */
15216
Victor Stinner3a50e702011-10-18 21:21:00 +020015217int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015218{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015219 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015220 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015221 0x000A, /* LINE FEED */
15222 0x000D, /* CARRIAGE RETURN */
15223 0x001C, /* FILE SEPARATOR */
15224 0x001D, /* GROUP SEPARATOR */
15225 0x001E, /* RECORD SEPARATOR */
15226 0x0085, /* NEXT LINE */
15227 0x2028, /* LINE SEPARATOR */
15228 0x2029, /* PARAGRAPH SEPARATOR */
15229 };
15230
Fred Drakee4315f52000-05-09 19:53:39 +000015231 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015232 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015233 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015234 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015235 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015236
Guido van Rossumcacfc072002-05-24 19:01:59 +000015237 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015238 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015239
15240 /* initialize the linebreak bloom filter */
15241 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015242 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015243 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015244
Christian Heimes26532f72013-07-20 14:57:16 +020015245 if (PyType_Ready(&EncodingMapType) < 0)
15246 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015247
Benjamin Petersonc4311282012-10-30 23:21:10 -040015248 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15249 Py_FatalError("Can't initialize field name iterator type");
15250
15251 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15252 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015253
Victor Stinner3a50e702011-10-18 21:21:00 +020015254 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015255}
15256
15257/* Finalize the Unicode implementation */
15258
/* Does no work and always returns 0.
   NOTE(review): presumably the return value is a count of released
   free-list entries and no free list exists any more -- confirm. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15264
Guido van Rossumd57fd912000-03-10 22:53:23 +000015265void
Thomas Wouters78890102000-07-22 19:25:51 +000015266_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015267{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015268 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015269
Serhiy Storchaka05997252013-01-26 12:14:02 +020015270 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015271
Serhiy Storchaka05997252013-01-26 12:14:02 +020015272 for (i = 0; i < 256; i++)
15273 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015274 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015275 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015276}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015277
Walter Dörwald16807132007-05-25 13:52:07 +000015278void
15279PyUnicode_InternInPlace(PyObject **p)
15280{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015281 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015282 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015283#ifdef Py_DEBUG
15284 assert(s != NULL);
15285 assert(_PyUnicode_CHECK(s));
15286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015287 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015288 return;
15289#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 /* If it's a subclass, we don't really know what putting
15291 it in the interned dict might do. */
15292 if (!PyUnicode_CheckExact(s))
15293 return;
15294 if (PyUnicode_CHECK_INTERNED(s))
15295 return;
15296 if (interned == NULL) {
15297 interned = PyDict_New();
15298 if (interned == NULL) {
15299 PyErr_Clear(); /* Don't leave an exception */
15300 return;
15301 }
15302 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015304 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015305 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015306 if (t == NULL) {
15307 PyErr_Clear();
15308 return;
15309 }
15310 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015311 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015312 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015313 return;
15314 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 /* The two references in interned are not counted by refcnt.
15316 The deallocator will take care of this */
15317 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015318 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015319}
15320
15321void
15322PyUnicode_InternImmortal(PyObject **p)
15323{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015324 PyUnicode_InternInPlace(p);
15325 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015326 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015327 Py_INCREF(*p);
15328 }
Walter Dörwald16807132007-05-25 13:52:07 +000015329}
15330
15331PyObject *
15332PyUnicode_InternFromString(const char *cp)
15333{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015334 PyObject *s = PyUnicode_FromString(cp);
15335 if (s == NULL)
15336 return NULL;
15337 PyUnicode_InternInPlace(&s);
15338 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015339}
15340
Alexander Belopolsky40018472011-02-26 01:02:56 +000015341void
15342_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015343{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015345 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 Py_ssize_t i, n;
15347 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015348
Benjamin Peterson14339b62009-01-31 16:36:08 +000015349 if (interned == NULL || !PyDict_Check(interned))
15350 return;
15351 keys = PyDict_Keys(interned);
15352 if (keys == NULL || !PyList_Check(keys)) {
15353 PyErr_Clear();
15354 return;
15355 }
Walter Dörwald16807132007-05-25 13:52:07 +000015356
Benjamin Peterson14339b62009-01-31 16:36:08 +000015357 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15358 detector, interned unicode strings are not forcibly deallocated;
15359 rather, we give them their stolen references back, and then clear
15360 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015361
Benjamin Peterson14339b62009-01-31 16:36:08 +000015362 n = PyList_GET_SIZE(keys);
15363 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015364 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015366 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015367 if (PyUnicode_READY(s) == -1) {
15368 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015369 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015371 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015372 case SSTATE_NOT_INTERNED:
15373 /* XXX Shouldn't happen */
15374 break;
15375 case SSTATE_INTERNED_IMMORTAL:
15376 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015377 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015378 break;
15379 case SSTATE_INTERNED_MORTAL:
15380 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015381 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015382 break;
15383 default:
15384 Py_FatalError("Inconsistent interned string state.");
15385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015386 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015387 }
15388 fprintf(stderr, "total size of all interned strings: "
15389 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15390 "mortal/immortal\n", mortal_size, immortal_size);
15391 Py_DECREF(keys);
15392 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015393 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015394}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015395
15396
15397/********************* Unicode Iterator **************************/
15398
15399typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015400 PyObject_HEAD
15401 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015402 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015403} unicodeiterobject;
15404
15405static void
15406unicodeiter_dealloc(unicodeiterobject *it)
15407{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015408 _PyObject_GC_UNTRACK(it);
15409 Py_XDECREF(it->it_seq);
15410 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015411}
15412
15413static int
15414unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15415{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015416 Py_VISIT(it->it_seq);
15417 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015418}
15419
15420static PyObject *
15421unicodeiter_next(unicodeiterobject *it)
15422{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015423 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015424
Benjamin Peterson14339b62009-01-31 16:36:08 +000015425 assert(it != NULL);
15426 seq = it->it_seq;
15427 if (seq == NULL)
15428 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015429 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015431 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15432 int kind = PyUnicode_KIND(seq);
15433 void *data = PyUnicode_DATA(seq);
15434 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15435 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015436 if (item != NULL)
15437 ++it->it_index;
15438 return item;
15439 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015440
Benjamin Peterson14339b62009-01-31 16:36:08 +000015441 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015442 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015443 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015444}
15445
15446static PyObject *
15447unicodeiter_len(unicodeiterobject *it)
15448{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015449 Py_ssize_t len = 0;
15450 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015451 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015452 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015453}
15454
15455PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15456
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015457static PyObject *
15458unicodeiter_reduce(unicodeiterobject *it)
15459{
15460 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015461 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015462 it->it_seq, it->it_index);
15463 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015464 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015465 if (u == NULL)
15466 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015467 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015468 }
15469}
15470
15471PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15472
15473static PyObject *
15474unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15475{
15476 Py_ssize_t index = PyLong_AsSsize_t(state);
15477 if (index == -1 && PyErr_Occurred())
15478 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015479 if (it->it_seq != NULL) {
15480 if (index < 0)
15481 index = 0;
15482 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15483 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15484 it->it_index = index;
15485 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015486 Py_RETURN_NONE;
15487}
15488
15489PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15490
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015491static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015492 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015493 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015494 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15495 reduce_doc},
15496 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15497 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015498 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015499};
15500
15501PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015502 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15503 "str_iterator", /* tp_name */
15504 sizeof(unicodeiterobject), /* tp_basicsize */
15505 0, /* tp_itemsize */
15506 /* methods */
15507 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15508 0, /* tp_print */
15509 0, /* tp_getattr */
15510 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015511 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015512 0, /* tp_repr */
15513 0, /* tp_as_number */
15514 0, /* tp_as_sequence */
15515 0, /* tp_as_mapping */
15516 0, /* tp_hash */
15517 0, /* tp_call */
15518 0, /* tp_str */
15519 PyObject_GenericGetAttr, /* tp_getattro */
15520 0, /* tp_setattro */
15521 0, /* tp_as_buffer */
15522 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15523 0, /* tp_doc */
15524 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15525 0, /* tp_clear */
15526 0, /* tp_richcompare */
15527 0, /* tp_weaklistoffset */
15528 PyObject_SelfIter, /* tp_iter */
15529 (iternextfunc)unicodeiter_next, /* tp_iternext */
15530 unicodeiter_methods, /* tp_methods */
15531 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015532};
15533
15534static PyObject *
15535unicode_iter(PyObject *seq)
15536{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015537 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015538
Benjamin Peterson14339b62009-01-31 16:36:08 +000015539 if (!PyUnicode_Check(seq)) {
15540 PyErr_BadInternalCall();
15541 return NULL;
15542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015543 if (PyUnicode_READY(seq) == -1)
15544 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015545 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15546 if (it == NULL)
15547 return NULL;
15548 it->it_index = 0;
15549 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015550 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015551 _PyObject_GC_TRACK(it);
15552 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015553}
15554
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015555
15556size_t
15557Py_UNICODE_strlen(const Py_UNICODE *u)
15558{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015559 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015560}
15561
15562Py_UNICODE*
15563Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15564{
15565 Py_UNICODE *u = s1;
15566 while ((*u++ = *s2++));
15567 return s1;
15568}
15569
15570Py_UNICODE*
15571Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15572{
15573 Py_UNICODE *u = s1;
15574 while ((*u++ = *s2++))
15575 if (n-- == 0)
15576 break;
15577 return s1;
15578}
15579
15580Py_UNICODE*
15581Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15582{
15583 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015584 u1 += wcslen(u1);
15585 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015586 return s1;
15587}
15588
15589int
15590Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15591{
15592 while (*s1 && *s2 && *s1 == *s2)
15593 s1++, s2++;
15594 if (*s1 && *s2)
15595 return (*s1 < *s2) ? -1 : +1;
15596 if (*s1)
15597 return 1;
15598 if (*s2)
15599 return -1;
15600 return 0;
15601}
15602
15603int
15604Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15605{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015606 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015607 for (; n != 0; n--) {
15608 u1 = *s1;
15609 u2 = *s2;
15610 if (u1 != u2)
15611 return (u1 < u2) ? -1 : +1;
15612 if (u1 == '\0')
15613 return 0;
15614 s1++;
15615 s2++;
15616 }
15617 return 0;
15618}
15619
15620Py_UNICODE*
15621Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15622{
15623 const Py_UNICODE *p;
15624 for (p = s; *p; p++)
15625 if (*p == c)
15626 return (Py_UNICODE*)p;
15627 return NULL;
15628}
15629
15630Py_UNICODE*
15631Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15632{
15633 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015634 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015635 while (p != s) {
15636 p--;
15637 if (*p == c)
15638 return (Py_UNICODE*)p;
15639 }
15640 return NULL;
15641}
Victor Stinner331ea922010-08-10 16:37:20 +000015642
Victor Stinner71133ff2010-09-01 23:43:53 +000015643Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015644PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015645{
Victor Stinner577db2c2011-10-11 22:12:48 +020015646 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015647 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015649 if (!PyUnicode_Check(unicode)) {
15650 PyErr_BadArgument();
15651 return NULL;
15652 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015653 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015654 if (u == NULL)
15655 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015656 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015657 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015658 PyErr_NoMemory();
15659 return NULL;
15660 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015661 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015662 size *= sizeof(Py_UNICODE);
15663 copy = PyMem_Malloc(size);
15664 if (copy == NULL) {
15665 PyErr_NoMemory();
15666 return NULL;
15667 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015668 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015669 return copy;
15670}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015671
Georg Brandl66c221e2010-10-14 07:04:07 +000015672/* A _string module, to export formatter_parser and formatter_field_name_split
15673 to the string.Formatter class implemented in Python. */
15674
15675static PyMethodDef _string_methods[] = {
15676 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15677 METH_O, PyDoc_STR("split the argument as a field name")},
15678 {"formatter_parser", (PyCFunction) formatter_parser,
15679 METH_O, PyDoc_STR("parse the argument as a format string")},
15680 {NULL, NULL}
15681};
15682
15683static struct PyModuleDef _string_module = {
15684 PyModuleDef_HEAD_INIT,
15685 "_string",
15686 PyDoc_STR("string helper module"),
15687 0,
15688 _string_methods,
15689 NULL,
15690 NULL,
15691 NULL,
15692 NULL
15693};
15694
15695PyMODINIT_FUNC
15696PyInit__string(void)
15697{
15698 return PyModule_Create(&_string_module);
15699}
15700
15701
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015702#ifdef __cplusplus
15703}
15704#endif