/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/

/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside assert() by the macros and functions of this
   file: the structural consistency check (without content validation) in
   debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Field accessors for the Unicode object layouts.  The underscore-prefixed
   macros only cast and read the field (they are usable as lvalues); the
   PyUnicode_UTF8*() macros additionally assert that the object is a ready
   Unicode object. */

/* utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer: for compact ASCII objects the bytes stored directly after
   the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length: for compact ASCII objects the length field, otherwise
   the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr field of a PyASCIIObject. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* wstr_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length, state and hash fields of a PyASCIIObject (no type check). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind and length, with a type check in assert(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any field of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready().
   The argument is type-checked with assert(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer is the character data itself rather than a
   separate buffer.  Must not be used on compact ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer is the character data itself rather than a
   separate buffer.  Uses the macro parameter throughout; the previous
   definition referred to a variable named "unicode" and therefore only
   worked when the caller happened to have one in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data); a non-ready object with a wstr pointer
   always counts as owning it */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy is done four elements at a time for the largest multiple of
   four that fits, then element by element for the remainder.  Each
   argument is evaluated exactly once.  The source pointers are cast to
   "const from_type *" so that const-qualified input is not silently
   stripped of its qualifier. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

/* Platform-dependent over-allocation factor.
   NOTE(review): the use site is outside this section; judging from the
   percentages below the factor is presumably used as a divisor
   (extra = size / OVERALLOCATE_FACTOR) -- confirm against the callers. */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif

Walter Dörwald16807132007-05-25 13:52:07 +0000188/* This dictionary holds all interned unicode strings. Note that references
189 to strings in this dictionary are *not* counted in the string's ob_refcnt.
190 When the interned string reaches a refcnt of 0 the string deallocation
191 function will delete the reference from this dictionary.
192
193 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000194 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000195*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200196static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200199static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200200
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200202 do { \
203 if (unicode_empty != NULL) \
204 Py_INCREF(unicode_empty); \
205 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206 unicode_empty = PyUnicode_New(0, 0); \
207 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200208 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
210 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213
Serhiy Storchaka678db842013-01-26 12:16:36 +0200214#define _Py_RETURN_UNICODE_EMPTY() \
215 do { \
216 _Py_INCREF_UNICODE_EMPTY(); \
217 return unicode_empty; \
218 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200220/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700221static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200222_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200224/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000227/* Single character Unicode strings in the Latin-1 range are being
228 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200229static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by ASCII code (0..127): 1 for whitespace,
   0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200262/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200263static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200264static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100265static int unicode_modifiable(PyObject *unicode);
266
Victor Stinnerfe226c02011-10-03 03:52:20 +0200267
Alexander Belopolsky40018472011-02-26 01:02:56 +0000268static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100269_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200270static PyObject *
271_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272static PyObject *
273_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274
275static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000276unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000277 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100278 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000279 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280
Alexander Belopolsky40018472011-02-26 01:02:56 +0000281static void
282raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300283 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100284 PyObject *unicode,
285 Py_ssize_t startpos, Py_ssize_t endpos,
286 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000287
/* Same for linebreaks.
   Lookup table indexed by ASCII code (0..127): 1 for line break
   characters, 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

INADA Naoki3ae20562017-01-16 20:41:20 +0900316static int convert_uc(PyObject *obj, void *addr);
317
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300318#include "clinic/unicodeobject.c.h"
319
/* Identifiers for the codec error handler names recognised by
   get_error_handler(); _Py_ERROR_OTHER stands for any other name. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors may be NULL, which is treated like "strict".  Names are compared
   exactly (case-sensitive); any unrecognised name yields _Py_ERROR_OTHER.
   _Py_ERROR_UNKNOWN is never returned. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        return _Py_ERROR_STRICT;
    }
    if (strcmp(errors, "surrogateescape") == 0) {
        return _Py_ERROR_SURROGATEESCAPE;
    }
    if (strcmp(errors, "replace") == 0) {
        return _Py_ERROR_REPLACE;
    }
    if (strcmp(errors, "ignore") == 0) {
        return _Py_ERROR_IGNORE;
    }
    if (strcmp(errors, "backslashreplace") == 0) {
        return _Py_ERROR_BACKSLASHREPLACE;
    }
    if (strcmp(errors, "surrogatepass") == 0) {
        return _Py_ERROR_SURROGATEPASS;
    }
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
        return _Py_ERROR_XMLCHARREFREPLACE;
    }
    return _Py_ERROR_OTHER;
}

Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300359/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000362PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000363{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000364#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000365 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000366#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000367 /* This is actually an illegal character, so it should
368 not be passed to unichr. */
369 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000370#endif
371}
372
#ifdef Py_DEBUG
/* Debug-only validation of the internal invariants of a Unicode object.

   op must be a Unicode object.  The state flags (kind, compact, ascii,
   ready, interned) are checked against the data, utf8 and wstr pointers
   and their lengths.  If check_content is non-zero and the string is not
   a legacy wchar_t string, the characters are scanned as well to verify
   that the narrowest possible kind is used and that the buffer is
   null-terminated.

   Every violation trips an assert(); the function itself always returns 1
   so that the call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: nothing but the PyASCIIObject header to check */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be a null terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif

Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489static PyObject*
490unicode_result_wchar(PyObject *unicode)
491{
492#ifndef Py_DEBUG
493 Py_ssize_t len;
494
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100495 len = _PyUnicode_WSTR_LENGTH(unicode);
496 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100497 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 }
500
501 if (len == 1) {
502 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100503 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100504 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505 Py_DECREF(unicode);
506 return latin1_char;
507 }
508 }
509
510 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200511 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 return NULL;
513 }
514#else
Victor Stinneraa771272012-10-04 02:32:58 +0200515 assert(Py_REFCNT(unicode) == 1);
516
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100517 /* don't make the result ready in debug mode to ensure that the caller
518 makes the string ready before using it */
519 assert(_PyUnicode_CheckConsistency(unicode, 1));
520#endif
521 return unicode;
522}
523
524static PyObject*
525unicode_result_ready(PyObject *unicode)
526{
527 Py_ssize_t length;
528
529 length = PyUnicode_GET_LENGTH(unicode);
530 if (length == 0) {
531 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100532 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200533 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 }
535 return unicode_empty;
536 }
537
538 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200539 void *data = PyUnicode_DATA(unicode);
540 int kind = PyUnicode_KIND(unicode);
541 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 if (ch < 256) {
543 PyObject *latin1_char = unicode_latin1[ch];
544 if (latin1_char != NULL) {
545 if (unicode != latin1_char) {
546 Py_INCREF(latin1_char);
547 Py_DECREF(unicode);
548 }
549 return latin1_char;
550 }
551 else {
552 assert(_PyUnicode_CheckConsistency(unicode, 1));
553 Py_INCREF(unicode);
554 unicode_latin1[ch] = unicode;
555 return unicode;
556 }
557 }
558 }
559
560 assert(_PyUnicode_CheckConsistency(unicode, 1));
561 return unicode;
562}
563
564static PyObject*
565unicode_result(PyObject *unicode)
566{
567 assert(_PyUnicode_CHECK(unicode));
568 if (PyUnicode_IS_READY(unicode))
569 return unicode_result_ready(unicode);
570 else
571 return unicode_result_wchar(unicode);
572}
573
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574static PyObject*
575unicode_result_unchanged(PyObject *unicode)
576{
577 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500578 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100579 return NULL;
580 Py_INCREF(unicode);
581 return unicode;
582 }
583 else
584 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100585 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100586}
587
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200588/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589 ASCII, Latin1, UTF-8, etc. */
590static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200591backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200592 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593{
Victor Stinnerad771582015-10-09 12:38:53 +0200594 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200595 Py_UCS4 ch;
596 enum PyUnicode_Kind kind;
597 void *data;
598
599 assert(PyUnicode_IS_READY(unicode));
600 kind = PyUnicode_KIND(unicode);
601 data = PyUnicode_DATA(unicode);
602
603 size = 0;
604 /* determine replacement size */
605 for (i = collstart; i < collend; ++i) {
606 Py_ssize_t incr;
607
608 ch = PyUnicode_READ(kind, data, i);
609 if (ch < 0x100)
610 incr = 2+2;
611 else if (ch < 0x10000)
612 incr = 2+4;
613 else {
614 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200615 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200616 }
617 if (size > PY_SSIZE_T_MAX - incr) {
618 PyErr_SetString(PyExc_OverflowError,
619 "encoded result is too long for a Python string");
620 return NULL;
621 }
622 size += incr;
623 }
624
Victor Stinnerad771582015-10-09 12:38:53 +0200625 str = _PyBytesWriter_Prepare(writer, str, size);
626 if (str == NULL)
627 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628
629 /* generate replacement */
630 for (i = collstart; i < collend; ++i) {
631 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200632 *str++ = '\\';
633 if (ch >= 0x00010000) {
634 *str++ = 'U';
635 *str++ = Py_hexdigits[(ch>>28)&0xf];
636 *str++ = Py_hexdigits[(ch>>24)&0xf];
637 *str++ = Py_hexdigits[(ch>>20)&0xf];
638 *str++ = Py_hexdigits[(ch>>16)&0xf];
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200641 }
Victor Stinner797485e2015-10-09 03:17:30 +0200642 else if (ch >= 0x100) {
643 *str++ = 'u';
644 *str++ = Py_hexdigits[(ch>>12)&0xf];
645 *str++ = Py_hexdigits[(ch>>8)&0xf];
646 }
647 else
648 *str++ = 'x';
649 *str++ = Py_hexdigits[(ch>>4)&0xf];
650 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 }
652 return str;
653}
654
655/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656 ASCII, Latin1, UTF-8, etc. */
657static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200658xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200659 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660{
Victor Stinnerad771582015-10-09 12:38:53 +0200661 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 Py_UCS4 ch;
663 enum PyUnicode_Kind kind;
664 void *data;
665
666 assert(PyUnicode_IS_READY(unicode));
667 kind = PyUnicode_KIND(unicode);
668 data = PyUnicode_DATA(unicode);
669
670 size = 0;
671 /* determine replacement size */
672 for (i = collstart; i < collend; ++i) {
673 Py_ssize_t incr;
674
675 ch = PyUnicode_READ(kind, data, i);
676 if (ch < 10)
677 incr = 2+1+1;
678 else if (ch < 100)
679 incr = 2+2+1;
680 else if (ch < 1000)
681 incr = 2+3+1;
682 else if (ch < 10000)
683 incr = 2+4+1;
684 else if (ch < 100000)
685 incr = 2+5+1;
686 else if (ch < 1000000)
687 incr = 2+6+1;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+7+1;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
Victor Stinnerad771582015-10-09 12:38:53 +0200700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707 }
708 return str;
709}
710
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711/* --- Bloom Filters ----------------------------------------------------- */
712
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
716
717/* the linebreak mask is set up by Unicode_Init below */
718
/* Number of bits used by a bloom mask, chosen from the width of a long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Initially all bits are set, so BLOOM() accepts every character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of ch is set in mask.
   Both arguments are parenthesized so that any expression can be passed. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for ch < 128, otherwise the bloom filter is
   used to skip the Py_UNICODE_ISLINEBREAK() call when the bit is not set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700739static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741{
Victor Stinnera85af502013-04-09 21:53:54 +0200742#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743 do { \
744 TYPE *data = (TYPE *)PTR; \
745 TYPE *end = data + LEN; \
746 Py_UCS4 ch; \
747 for (; data != end; data++) { \
748 ch = *data; \
749 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750 } \
751 break; \
752 } while (0)
753
Thomas Wouters477c8d52006-05-27 19:21:47 +0000754 /* calculate simple bloom-style bitmask for a given unicode string */
755
Antoine Pitrouf068f942010-01-13 14:19:12 +0000756 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000757
758 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200759 switch (kind) {
760 case PyUnicode_1BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762 break;
763 case PyUnicode_2BYTE_KIND:
764 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765 break;
766 case PyUnicode_4BYTE_KIND:
767 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768 break;
769 default:
770 assert(0);
771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200773
774#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775}
776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300777static int
778ensure_unicode(PyObject *obj)
779{
780 if (!PyUnicode_Check(obj)) {
781 PyErr_Format(PyExc_TypeError,
782 "must be str, not %.100s",
783 Py_TYPE(obj)->tp_name);
784 return -1;
785 }
786 return PyUnicode_READY(obj);
787}
788
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200789/* Compilation of templated routines */
790
791#include "stringlib/asciilib.h"
792#include "stringlib/fastsearch.h"
793#include "stringlib/partition.h"
794#include "stringlib/split.h"
795#include "stringlib/count.h"
796#include "stringlib/find.h"
797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs1lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300807#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs2lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300818#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
823#include "stringlib/ucs4lib.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/partition.h"
826#include "stringlib/split.h"
827#include "stringlib/count.h"
828#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300829#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200830#include "stringlib/find_max_char.h"
831#include "stringlib/localeutil.h"
832#include "stringlib/undef.h"
833
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834#include "stringlib/unicodedefs.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/count.h"
837#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100838#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200839
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840/* --- Unicode Object ----------------------------------------------------- */
841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200843fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700845static inline Py_ssize_t
846findchar(const void *s, int kind,
847 Py_ssize_t size, Py_UCS4 ch,
848 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 switch (kind) {
851 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200852 if ((Py_UCS1) ch != ch)
853 return -1;
854 if (direction > 0)
855 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856 else
857 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200858 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200859 if ((Py_UCS2) ch != ch)
860 return -1;
861 if (direction > 0)
862 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863 else
864 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200866 if (direction > 0)
867 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868 else
869 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200870 default:
871 assert(0);
872 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
#ifdef Py_DEBUG
/* Overwrite the characters [old_length; length) of a Unicode string with
   0xff bytes, to detect bugs earlier.  Nothing is written if the string did
   not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind is the character width in bytes */
        Py_UCS1 *first = bytes + old_length * char_width;
        memset(first, 0xff, (new_length - old_length) * char_width);
    }
}
#endif
894
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895static PyObject*
896resize_compact(PyObject *unicode, Py_ssize_t length)
897{
898 Py_ssize_t char_size;
899 Py_ssize_t struct_size;
900 Py_ssize_t new_size;
901 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100902 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200903#ifdef Py_DEBUG
904 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905#endif
906
Victor Stinner79891572012-05-03 13:43:07 +0200907 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100909 assert(PyUnicode_IS_COMPACT(unicode));
910
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200911 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913 struct_size = sizeof(PyASCIIObject);
914 else
915 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200916 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919 PyErr_NoMemory();
920 return NULL;
921 }
922 new_size = (struct_size + (length + 1) * char_size);
923
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925 PyObject_DEL(_PyUnicode_UTF8(unicode));
926 _PyUnicode_UTF8(unicode) = NULL;
927 _PyUnicode_UTF8_LENGTH(unicode) = 0;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 _Py_DEC_REFTOTAL;
930 _Py_ForgetReference(unicode);
931
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300932 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100933 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100934 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 PyErr_NoMemory();
936 return NULL;
937 }
Victor Stinner84def372011-12-11 20:04:56 +0100938 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100944 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200945 _PyUnicode_WSTR_LENGTH(unicode) = length;
946 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100947 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_WSTR(unicode));
949 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100950 if (!PyUnicode_IS_ASCII(unicode))
951 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100952 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200953#ifdef Py_DEBUG
954 unicode_fill_invalid(unicode, old_length);
955#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200958 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 return unicode;
960}
961
Alexander Belopolsky40018472011-02-26 01:02:56 +0000962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200963resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964{
Victor Stinner95663112011-10-04 01:03:50 +0200965 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100966 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000969
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 if (PyUnicode_IS_READY(unicode)) {
971 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200972 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200974#ifdef Py_DEBUG
975 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
978 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200979 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982
983 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984 PyErr_NoMemory();
985 return -1;
986 }
987 new_size = (length + 1) * char_size;
988
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990 {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 data = (PyObject *)PyObject_REALLOC(data, new_size);
997 if (data == NULL) {
998 PyErr_NoMemory();
999 return -1;
1000 }
1001 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 _PyUnicode_WSTR_LENGTH(unicode) = length;
1005 }
1006 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001007 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 _PyUnicode_UTF8_LENGTH(unicode) = length;
1009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 _PyUnicode_LENGTH(unicode) = length;
1011 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001012#ifdef Py_DEBUG
1013 unicode_fill_invalid(unicode, old_length);
1014#endif
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001016 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinner95663112011-10-04 01:03:50 +02001020 assert(_PyUnicode_WSTR(unicode) != NULL);
1021
1022 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001023 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001024 PyErr_NoMemory();
1025 return -1;
1026 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001028 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001030 if (!wstr) {
1031 PyErr_NoMemory();
1032 return -1;
1033 }
1034 _PyUnicode_WSTR(unicode) = wstr;
1035 _PyUnicode_WSTR(unicode)[length] = 0;
1036 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001037 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return 0;
1039}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001073 Ux0000 terminated; some code (e.g. new_identifier)
1074 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1186
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *chars = _PyUnicode_COMPACT_DATA(unicode);
    return chars;
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t buffer [begin; end) to UCS4
   and store the result in the 4-byte data of unicode.  A high surrogate
   directly followed by a low surrogate is joined into a single code point;
   any other unit is copied unchanged.  The other conversions are implemented
   as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!(Py_UNICODE_IS_HIGH_SURROGATE(src[0])
              && (src+1) < end
              && Py_UNICODE_IS_LOW_SURROGATE(src[1])))
        {
            /* regular code unit (or lone surrogate): copy as is */
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
1522 assert(0);
1523 return -1;
1524 }
1525 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001526 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001527 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001528 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001529 Py_ssize_t i;
1530
Victor Stinnera0702ab2011-09-29 14:14:38 +02001531 for (i=0; i < how_many; i++) {
1532 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (ch > to_maxchar)
1534 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001535 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 }
1538 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 return 0;
1540}
1541
Victor Stinnerd3f08822012-05-29 12:57:52 +02001542void
1543_PyUnicode_FastCopyCharacters(
1544 PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546{
1547 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548}
1549
1550Py_ssize_t
1551PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many)
1554{
1555 int err;
1556
1557 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561
Benjamin Petersonbac79492012-01-14 13:34:47 -05001562 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001564 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001567 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001568 PyErr_SetString(PyExc_IndexError, "string index out of range");
1569 return -1;
1570 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001571 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001572 PyErr_SetString(PyExc_IndexError, "string index out of range");
1573 return -1;
1574 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001575 if (how_many < 0) {
1576 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577 return -1;
1578 }
1579 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001582 "Cannot write %zi characters at %zi "
1583 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001584 how_many, to_start, PyUnicode_GET_LENGTH(to));
1585 return -1;
1586 }
1587
1588 if (how_many == 0)
1589 return 0;
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 return -1;
1593
1594 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595 if (err) {
1596 PyErr_Format(PyExc_SystemError,
1597 "Cannot copy %s characters "
1598 "into a string of %s characters",
1599 unicode_kind_name(from),
1600 unicode_kind_name(to));
1601 return -1;
1602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604}
1605
Victor Stinner17222162011-09-28 22:15:37 +02001606/* Find the maximum code point and count the number of surrogate pairs so a
1607 correct string length can be computed before converting a string to UCS4.
1608 This function counts single surrogates as a character and not as a pair.
1609
1610 Return 0 on success, or -1 on error. */
1611static int
1612find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614{
1615 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001616 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617
Victor Stinnerc53be962011-10-02 21:33:54 +02001618 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 *num_surrogates = 0;
1620 *maxchar = 0;
1621
1622 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001624 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625 && (iter+1) < end
1626 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627 {
1628 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629 ++(*num_surrogates);
1630 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001634 {
1635 ch = *iter;
1636 iter++;
1637 }
1638 if (ch > *maxchar) {
1639 *maxchar = ch;
1640 if (*maxchar > MAX_UNICODE) {
1641 PyErr_Format(PyExc_ValueError,
1642 "character U+%x is not in range [U+0000; U+10ffff]",
1643 ch);
1644 return -1;
1645 }
1646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 }
1648 return 0;
1649}
1650
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001651int
1652_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653{
1654 wchar_t *end;
1655 Py_UCS4 maxchar = 0;
1656 Py_ssize_t num_surrogates;
1657#if SIZEOF_WCHAR_T == 2
1658 Py_ssize_t length_wo_surrogates;
1659#endif
1660
Georg Brandl7597add2011-10-05 16:36:47 +02001661 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001662 strings were created using _PyObject_New() and where no canonical
1663 representation (the str field) has been set yet aka strings
1664 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001665 assert(_PyUnicode_CHECK(unicode));
1666 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001668 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001669 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001670 /* Actually, it should neither be interned nor be anything else: */
1671 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001674 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677
1678 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001679 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyErr_NoMemory();
1682 return -1;
1683 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 _PyUnicode_WSTR(unicode), end,
1686 PyUnicode_1BYTE_DATA(unicode));
1687 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001691 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 }
1695 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001696 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
1700 PyObject_FREE(_PyUnicode_WSTR(unicode));
1701 _PyUnicode_WSTR(unicode) = NULL;
1702 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703 }
1704 /* In this case we might have to convert down from 4-byte native
1705 wchar_t to 2-byte unicode. */
1706 else if (maxchar < 65536) {
1707 assert(num_surrogates == 0 &&
1708 "FindMaxCharAndNumSurrogatePairs() messed up");
1709
Victor Stinner506f5922011-09-28 22:34:18 +02001710#if SIZEOF_WCHAR_T == 2
1711 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718#else
1719 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001720 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001721 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001723 PyErr_NoMemory();
1724 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 }
Victor Stinner506f5922011-09-28 22:34:18 +02001726 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727 _PyUnicode_WSTR(unicode), end,
1728 PyUnicode_2BYTE_DATA(unicode));
1729 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001732 _PyUnicode_UTF8(unicode) = NULL;
1733 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 }
1739 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740 else {
1741#if SIZEOF_WCHAR_T == 2
1742 /* in case the native representation is 2-bytes, we need to allocate a
1743 new normalized 4-byte version. */
1744 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001745 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746 PyErr_NoMemory();
1747 return -1;
1748 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001749 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 PyErr_NoMemory();
1752 return -1;
1753 }
1754 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001758 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 PyObject_FREE(_PyUnicode_WSTR(unicode));
1762 _PyUnicode_WSTR(unicode) = NULL;
1763 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764#else
1765 assert(num_surrogates == 0);
1766
Victor Stinnerc3c74152011-10-02 20:39:55 +02001767 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001769 _PyUnicode_UTF8(unicode) = NULL;
1770 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772#endif
1773 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774 }
1775 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001776 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 return 0;
1778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001781unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald16807132007-05-25 13:52:07 +00001783 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 case SSTATE_NOT_INTERNED:
1785 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001786
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 case SSTATE_INTERNED_MORTAL:
1788 /* revive dead object temporarily for DelItem */
1789 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001790 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 Py_FatalError(
1792 "deletion of interned string failed");
1793 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001794
Benjamin Peterson29060642009-01-31 22:14:21 +00001795 case SSTATE_INTERNED_IMMORTAL:
1796 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001797
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 default:
1799 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001800 }
1801
Victor Stinner03490912011-10-03 23:45:12 +02001802 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001804 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001806 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001809 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character string from the
   unicode_latin1 table), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of exactly one character that is past the legacy
       wchar_t-only state can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1828
Alexander Belopolsky40018472011-02-26 01:02:56 +00001829static int
Victor Stinner488fa492011-12-12 00:01:39 +01001830unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831{
Victor Stinner488fa492011-12-12 00:01:39 +01001832 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 if (Py_REFCNT(unicode) != 1)
1834 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001835 if (_PyUnicode_HASH(unicode) != -1)
1836 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837 if (PyUnicode_CHECK_INTERNED(unicode))
1838 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001839 if (!PyUnicode_CheckExact(unicode))
1840 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001841#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001842 /* singleton refcount is greater than 1 */
1843 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001844#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001845 return 1;
1846}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848static int
1849unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850{
1851 PyObject *unicode;
1852 Py_ssize_t old_length;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856
1857 assert(unicode != NULL);
1858 assert(PyUnicode_Check(unicode));
1859 assert(0 <= length);
1860
Victor Stinner910337b2011-10-03 03:20:16 +02001861 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 old_length = PyUnicode_WSTR_LENGTH(unicode);
1863 else
1864 old_length = PyUnicode_GET_LENGTH(unicode);
1865 if (old_length == length)
1866 return 0;
1867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001869 _Py_INCREF_UNICODE_EMPTY();
1870 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001872 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001873 return 0;
1874 }
1875
Victor Stinner488fa492011-12-12 00:01:39 +01001876 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *copy = resize_copy(unicode, length);
1878 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001880 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882 }
1883
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001885 PyObject *new_unicode = resize_compact(unicode, length);
1886 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001888 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001891 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001892}
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001896{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897 PyObject *unicode;
1898 if (p_unicode == NULL) {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001903 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001904 {
1905 PyErr_BadInternalCall();
1906 return -1;
1907 }
1908 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001911/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001913 WARNING: The function doesn't copy the terminating null character and
1914 doesn't check the maximum character (may write a latin1 character in an
1915 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001916static void
1917unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001919{
1920 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923
1924 switch (kind) {
1925 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001927#ifdef Py_DEBUG
1928 if (PyUnicode_IS_ASCII(unicode)) {
1929 Py_UCS4 maxchar = ucs1lib_find_max_char(
1930 (const Py_UCS1*)str,
1931 (const Py_UCS1*)str + len);
1932 assert(maxchar < 128);
1933 }
1934#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001935 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 case PyUnicode_2BYTE_KIND: {
1939 Py_UCS2 *start = (Py_UCS2 *)data + index;
1940 Py_UCS2 *ucs2 = start;
1941 assert(index <= PyUnicode_GET_LENGTH(unicode));
1942
Victor Stinner184252a2012-06-16 02:57:41 +02001943 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 *ucs2 = (Py_UCS2)*str;
1945
1946 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001947 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 default: {
1950 Py_UCS4 *start = (Py_UCS4 *)data + index;
1951 Py_UCS4 *ucs4 = start;
1952 assert(kind == PyUnicode_4BYTE_KIND);
1953 assert(index <= PyUnicode_GET_LENGTH(unicode));
1954
Victor Stinner184252a2012-06-16 02:57:41 +02001955 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001956 *ucs4 = (Py_UCS4)*str;
1957
1958 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001959 }
1960 }
1961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963static PyObject*
1964get_latin1_char(unsigned char ch)
1965{
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode)
1970 return NULL;
1971 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 unicode_latin1[ch] = unicode;
1974 }
1975 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001976 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977}
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979static PyObject*
1980unicode_char(Py_UCS4 ch)
1981{
1982 PyObject *unicode;
1983
1984 assert(ch <= MAX_UNICODE);
1985
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001986 if (ch < 256)
1987 return get_latin1_char(ch);
1988
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 unicode = PyUnicode_New(1, ch);
1990 if (unicode == NULL)
1991 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001992
1993 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001996 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999 }
2000 assert(_PyUnicode_CheckConsistency(unicode, 1));
2001 return unicode;
2002}
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004PyObject *
2005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002007 if (u == NULL)
2008 return (PyObject*)_PyUnicode_New(size);
2009
2010 if (size < 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 return PyUnicode_FromWideChar(u, size);
2016}
2017
2018PyObject *
2019PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 Py_UCS4 maxchar = 0;
2023 Py_ssize_t num_surrogates;
2024
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL && size != 0) {
2026 PyErr_BadInternalCall();
2027 return NULL;
2028 }
2029
2030 if (size == -1) {
2031 size = wcslen(u);
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 /* If the Unicode data is known at construction time, we can apply
2035 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002038 if (size == 0)
2039 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 /* Single character Unicode objects in the Latin-1 range are
2042 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002043 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return get_latin1_char((unsigned char)*u);
2045
2046 /* If not empty and not single character, copy the Unicode data
2047 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002048 if (find_maxchar_surrogates(u, u + size,
2049 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 return NULL;
2051
Victor Stinner8faf8212011-12-08 22:14:11 +01002052 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!unicode)
2054 return NULL;
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 switch (PyUnicode_KIND(unicode)) {
2057 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060 break;
2061 case PyUnicode_2BYTE_KIND:
2062#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002063 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002065 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067#endif
2068 break;
2069 case PyUnicode_4BYTE_KIND:
2070#if SIZEOF_WCHAR_T == 2
2071 /* This is the only case which has to process surrogates, thus
2072 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002073 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#else
2075 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002076 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077#endif
2078 break;
2079 default:
2080 assert(0 && "Impossible state");
2081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002083 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (size < 0) {
2090 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 return NULL;
2093 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002094 if (u != NULL)
2095 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096 else
2097 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002098}
2099
Alexander Belopolsky40018472011-02-26 01:02:56 +00002100PyObject *
2101PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102{
2103 size_t size = strlen(u);
2104 if (size > PY_SSIZE_T_MAX) {
2105 PyErr_SetString(PyExc_OverflowError, "input too long");
2106 return NULL;
2107 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002108 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002109}
2110
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002111PyObject *
2112_PyUnicode_FromId(_Py_Identifier *id)
2113{
2114 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002115 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116 strlen(id->string),
2117 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002118 if (!id->object)
2119 return NULL;
2120 PyUnicode_InternInPlace(&id->object);
2121 assert(!id->next);
2122 id->next = static_strings;
2123 static_strings = id;
2124 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002125 return id->object;
2126}
2127
2128void
2129_PyUnicode_ClearStaticStrings()
2130{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002131 _Py_Identifier *tmp, *s = static_strings;
2132 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002133 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002134 tmp = s->next;
2135 s->next = NULL;
2136 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002138 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139}
2140
Benjamin Peterson0df54292012-03-26 14:50:32 -04002141/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Victor Stinnerd3f08822012-05-29 12:57:52 +02002143PyObject*
2144_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002145{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002146 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002148 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002149#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002152 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002153 }
Victor Stinner785938e2011-12-11 20:09:03 +01002154 unicode = PyUnicode_New(size, 127);
2155 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002156 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002157 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002160}
2161
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002162static Py_UCS4
2163kind_maxchar_limit(unsigned int kind)
2164{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002165 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002166 case PyUnicode_1BYTE_KIND:
2167 return 0x80;
2168 case PyUnicode_2BYTE_KIND:
2169 return 0x100;
2170 case PyUnicode_4BYTE_KIND:
2171 return 0x10000;
2172 default:
2173 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002175 }
2176}
2177
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002178static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002179align_maxchar(Py_UCS4 maxchar)
2180{
2181 if (maxchar <= 127)
2182 return 127;
2183 else if (maxchar <= 255)
2184 return 255;
2185 else if (maxchar <= 65535)
2186 return 65535;
2187 else
2188 return MAX_UNICODE;
2189}
2190
Victor Stinner702c7342011-10-05 13:50:52 +02002191static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002192_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002195 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196
Serhiy Storchaka678db842013-01-26 12:16:36 +02002197 if (size == 0)
2198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002199 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002200 if (size == 1)
2201 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002202
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002203 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002204 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 if (!res)
2206 return NULL;
2207 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002208 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002210}
2211
Victor Stinnere57b1c02011-09-28 22:20:48 +02002212static PyObject*
2213_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214{
2215 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002216 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002217
Serhiy Storchaka678db842013-01-26 12:16:36 +02002218 if (size == 0)
2219 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002220 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002221 if (size == 1)
2222 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002223
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002224 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002225 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 if (!res)
2227 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002230 else {
2231 _PyUnicode_CONVERT_BYTES(
2232 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002234 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 return res;
2236}
2237
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238static PyObject*
2239_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240{
2241 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002242 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243
Serhiy Storchaka678db842013-01-26 12:16:36 +02002244 if (size == 0)
2245 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002247 if (size == 1)
2248 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002250 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 if (!res)
2253 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002254 if (max_char < 256)
2255 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256 PyUnicode_1BYTE_DATA(res));
2257 else if (max_char < 0x10000)
2258 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259 PyUnicode_2BYTE_DATA(res));
2260 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002262 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 return res;
2264}
2265
2266PyObject*
2267PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002269 if (size < 0) {
2270 PyErr_SetString(PyExc_ValueError, "size must be positive");
2271 return NULL;
2272 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002273 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002275 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002277 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002281 PyErr_SetString(PyExc_SystemError, "invalid kind");
2282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284}
2285
Victor Stinnerece58de2012-04-23 23:36:38 +02002286Py_UCS4
2287_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288{
2289 enum PyUnicode_Kind kind;
2290 void *startptr, *endptr;
2291
2292 assert(PyUnicode_IS_READY(unicode));
2293 assert(0 <= start);
2294 assert(end <= PyUnicode_GET_LENGTH(unicode));
2295 assert(start <= end);
2296
2297 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2298 return PyUnicode_MAX_CHAR_VALUE(unicode);
2299
2300 if (start == end)
2301 return 127;
2302
Victor Stinner94d558b2012-04-27 22:26:58 +02002303 if (PyUnicode_IS_ASCII(unicode))
2304 return 127;
2305
Victor Stinnerece58de2012-04-23 23:36:38 +02002306 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002307 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002308 endptr = (char *)startptr + end * kind;
2309 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002310 switch(kind) {
2311 case PyUnicode_1BYTE_KIND:
2312 return ucs1lib_find_max_char(startptr, endptr);
2313 case PyUnicode_2BYTE_KIND:
2314 return ucs2lib_find_max_char(startptr, endptr);
2315 case PyUnicode_4BYTE_KIND:
2316 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002318 assert(0);
2319 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002320 }
2321}
2322
Victor Stinner25a4b292011-10-06 12:31:55 +02002323/* Ensure that a string uses the most efficient storage, if it is not the
2324 case: create a new string with of the right kind. Write NULL into *p_unicode
2325 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002326static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002327unicode_adjust_maxchar(PyObject **p_unicode)
2328{
2329 PyObject *unicode, *copy;
2330 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002331 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 unsigned int kind;
2333
2334 assert(p_unicode != NULL);
2335 unicode = *p_unicode;
2336 assert(PyUnicode_IS_READY(unicode));
2337 if (PyUnicode_IS_ASCII(unicode))
2338 return;
2339
2340 len = PyUnicode_GET_LENGTH(unicode);
2341 kind = PyUnicode_KIND(unicode);
2342 if (kind == PyUnicode_1BYTE_KIND) {
2343 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002344 max_char = ucs1lib_find_max_char(u, u + len);
2345 if (max_char >= 128)
2346 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002347 }
2348 else if (kind == PyUnicode_2BYTE_KIND) {
2349 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002350 max_char = ucs2lib_find_max_char(u, u + len);
2351 if (max_char >= 256)
2352 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002353 }
2354 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002356 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 max_char = ucs4lib_find_max_char(u, u + len);
2358 if (max_char >= 0x10000)
2359 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002360 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002361 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002362 if (copy != NULL)
2363 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 Py_DECREF(unicode);
2365 *p_unicode = copy;
2366}
2367
Victor Stinner034f6cf2011-09-30 02:26:44 +02002368PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002369_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370{
Victor Stinner87af4f22011-11-21 23:03:47 +01002371 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002372 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002373
Victor Stinner034f6cf2011-09-30 02:26:44 +02002374 if (!PyUnicode_Check(unicode)) {
2375 PyErr_BadInternalCall();
2376 return NULL;
2377 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002378 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002380
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 length = PyUnicode_GET_LENGTH(unicode);
2382 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 if (!copy)
2384 return NULL;
2385 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2386
Christian Heimesf051e432016-09-13 20:22:02 +02002387 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002388 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002389 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002391}
2392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393
Victor Stinnerbc603d12011-10-02 01:00:40 +02002394/* Widen Unicode objects to larger buffers. Don't write terminating null
2395 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396
2397void*
2398_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2399{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 Py_ssize_t len;
2401 void *result;
2402 unsigned int skind;
2403
Benjamin Petersonbac79492012-01-14 13:34:47 -05002404 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405 return NULL;
2406
2407 len = PyUnicode_GET_LENGTH(s);
2408 skind = PyUnicode_KIND(s);
2409 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002410 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 return NULL;
2412 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002413 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002415 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 if (!result)
2417 return PyErr_NoMemory();
2418 assert(skind == PyUnicode_1BYTE_KIND);
2419 _PyUnicode_CONVERT_BYTES(
2420 Py_UCS1, Py_UCS2,
2421 PyUnicode_1BYTE_DATA(s),
2422 PyUnicode_1BYTE_DATA(s) + len,
2423 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 if (skind == PyUnicode_2BYTE_KIND) {
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS2, Py_UCS4,
2432 PyUnicode_2BYTE_DATA(s),
2433 PyUnicode_2BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 else {
2437 assert(skind == PyUnicode_1BYTE_KIND);
2438 _PyUnicode_CONVERT_BYTES(
2439 Py_UCS1, Py_UCS4,
2440 PyUnicode_1BYTE_DATA(s),
2441 PyUnicode_1BYTE_DATA(s) + len,
2442 result);
2443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002445 default:
2446 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 }
Victor Stinner01698042011-10-04 00:04:26 +02002448 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 return NULL;
2450}
2451
2452static Py_UCS4*
2453as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2454 int copy_null)
2455{
2456 int kind;
2457 void *data;
2458 Py_ssize_t len, targetlen;
2459 if (PyUnicode_READY(string) == -1)
2460 return NULL;
2461 kind = PyUnicode_KIND(string);
2462 data = PyUnicode_DATA(string);
2463 len = PyUnicode_GET_LENGTH(string);
2464 targetlen = len;
2465 if (copy_null)
2466 targetlen++;
2467 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002468 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!target) {
2470 PyErr_NoMemory();
2471 return NULL;
2472 }
2473 }
2474 else {
2475 if (targetsize < targetlen) {
2476 PyErr_Format(PyExc_SystemError,
2477 "string is longer than the buffer");
2478 if (copy_null && 0 < targetsize)
2479 target[0] = 0;
2480 return NULL;
2481 }
2482 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002483 if (kind == PyUnicode_1BYTE_KIND) {
2484 Py_UCS1 *start = (Py_UCS1 *) data;
2485 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002487 else if (kind == PyUnicode_2BYTE_KIND) {
2488 Py_UCS2 *start = (Py_UCS2 *) data;
2489 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2490 }
2491 else {
2492 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002493 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 if (copy_null)
2496 target[len] = 0;
2497 return target;
2498}
2499
2500Py_UCS4*
2501PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002504 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 PyErr_BadInternalCall();
2506 return NULL;
2507 }
2508 return as_ucs4(string, target, targetsize, copy_null);
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4Copy(PyObject *string)
2513{
2514 return as_ucs4(string, NULL, 0, 1);
2515}
2516
/* Size of the scratch buffers used to format %lld or %p output.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits
   plus one character for the sign; 53/22 is used as an upper bound for
   log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002521
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002522static int
2523unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2524 Py_ssize_t width, Py_ssize_t precision)
2525{
2526 Py_ssize_t length, fill, arglen;
2527 Py_UCS4 maxchar;
2528
2529 if (PyUnicode_READY(str) == -1)
2530 return -1;
2531
2532 length = PyUnicode_GET_LENGTH(str);
2533 if ((precision == -1 || precision >= length)
2534 && width <= length)
2535 return _PyUnicodeWriter_WriteStr(writer, str);
2536
2537 if (precision != -1)
2538 length = Py_MIN(precision, length);
2539
2540 arglen = Py_MAX(length, width);
2541 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2542 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2543 else
2544 maxchar = writer->maxchar;
2545
2546 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2547 return -1;
2548
2549 if (width > length) {
2550 fill = width - length;
2551 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2552 return -1;
2553 writer->pos += fill;
2554 }
2555
2556 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2557 str, 0, length);
2558 writer->pos += length;
2559 return 0;
2560}
2561
2562static int
2563unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2564 Py_ssize_t width, Py_ssize_t precision)
2565{
2566 /* UTF-8 */
2567 Py_ssize_t length;
2568 PyObject *unicode;
2569 int res;
2570
2571 length = strlen(str);
2572 if (precision != -1)
2573 length = Py_MIN(length, precision);
2574 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2575 if (unicode == NULL)
2576 return -1;
2577
2578 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2579 Py_DECREF(unicode);
2580 return res;
2581}
2582
Victor Stinner96865452011-03-01 23:44:09 +00002583static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002584unicode_fromformat_arg(_PyUnicodeWriter *writer,
2585 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002586{
Victor Stinnere215d962012-10-06 23:03:36 +02002587 const char *p;
2588 Py_ssize_t len;
2589 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 Py_ssize_t width;
2591 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002592 int longflag;
2593 int longlongflag;
2594 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002596
2597 p = f;
2598 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002599 zeropad = 0;
2600 if (*f == '0') {
2601 zeropad = 1;
2602 f++;
2603 }
Victor Stinner96865452011-03-01 23:44:09 +00002604
2605 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 width = -1;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002609 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002610 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002612 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002614 return NULL;
2615 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002617 f++;
2618 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 }
2620 precision = -1;
2621 if (*f == '.') {
2622 f++;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 precision = (*f - '0');
2625 f++;
2626 while (Py_ISDIGIT((unsigned)*f)) {
2627 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2628 PyErr_SetString(PyExc_ValueError,
2629 "precision too big");
2630 return NULL;
2631 }
2632 precision = (precision * 10) + (*f - '0');
2633 f++;
2634 }
2635 }
Victor Stinner96865452011-03-01 23:44:09 +00002636 if (*f == '%') {
2637 /* "%.3%s" => f points to "3" */
2638 f--;
2639 }
2640 }
2641 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002642 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002643 f--;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645
2646 /* Handle %ld, %lu, %lld and %llu. */
2647 longflag = 0;
2648 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002649 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002650 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002652 longflag = 1;
2653 ++f;
2654 }
Victor Stinner96865452011-03-01 23:44:09 +00002655 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002656 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002657 longlongflag = 1;
2658 f += 2;
2659 }
Victor Stinner96865452011-03-01 23:44:09 +00002660 }
2661 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002663 size_tflag = 1;
2664 ++f;
2665 }
Victor Stinnere215d962012-10-06 23:03:36 +02002666
2667 if (f[1] == '\0')
2668 writer->overallocate = 0;
2669
2670 switch (*f) {
2671 case 'c':
2672 {
2673 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002674 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002675 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002676 "character argument not in range(0x110000)");
2677 return NULL;
2678 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002679 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002680 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002681 break;
2682 }
2683
2684 case 'i':
2685 case 'd':
2686 case 'u':
2687 case 'x':
2688 {
2689 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002690 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002692
2693 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002694 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002695 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002697 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002698 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002699 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002700 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, size_t));
2703 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, unsigned int));
2706 }
2707 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002709 }
2710 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002714 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002715 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002716 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002717 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002719 va_arg(*vargs, Py_ssize_t));
2720 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002721 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_arg(*vargs, int));
2723 }
2724 assert(len >= 0);
2725
Victor Stinnere215d962012-10-06 23:03:36 +02002726 if (precision < len)
2727 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728
2729 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2731 return NULL;
2732
Victor Stinnere215d962012-10-06 23:03:36 +02002733 if (width > precision) {
2734 Py_UCS4 fillchar;
2735 fill = width - precision;
2736 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002737 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2738 return NULL;
2739 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 }
Victor Stinner15a11362012-10-06 23:48:20 +02002741 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002743 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2744 return NULL;
2745 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002746 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747
Victor Stinner4a587072013-11-19 12:54:53 +01002748 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2749 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002750 break;
2751 }
2752
2753 case 'p':
2754 {
2755 char number[MAX_LONG_LONG_CHARS];
2756
2757 len = sprintf(number, "%p", va_arg(*vargs, void*));
2758 assert(len >= 0);
2759
2760 /* %p is ill-defined: ensure leading 0x. */
2761 if (number[1] == 'X')
2762 number[1] = 'x';
2763 else if (number[1] != 'x') {
2764 memmove(number + 2, number,
2765 strlen(number) + 1);
2766 number[0] = '0';
2767 number[1] = 'x';
2768 len += 2;
2769 }
2770
Victor Stinner4a587072013-11-19 12:54:53 +01002771 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002772 return NULL;
2773 break;
2774 }
2775
2776 case 's':
2777 {
2778 /* UTF-8 */
2779 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002782 break;
2783 }
2784
2785 case 'U':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 assert(obj && _PyUnicode_CHECK(obj));
2789
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 break;
2793 }
2794
2795 case 'V':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002799 if (obj) {
2800 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 }
2804 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 assert(str != NULL);
2806 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002808 }
2809 break;
2810 }
2811
2812 case 'S':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *str;
2816 assert(obj);
2817 str = PyObject_Str(obj);
2818 if (!str)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(str);
2822 return NULL;
2823 }
2824 Py_DECREF(str);
2825 break;
2826 }
2827
2828 case 'R':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *repr;
2832 assert(obj);
2833 repr = PyObject_Repr(obj);
2834 if (!repr)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(repr);
2838 return NULL;
2839 }
2840 Py_DECREF(repr);
2841 break;
2842 }
2843
2844 case 'A':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *ascii;
2848 assert(obj);
2849 ascii = PyObject_ASCII(obj);
2850 if (!ascii)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(ascii);
2854 return NULL;
2855 }
2856 Py_DECREF(ascii);
2857 break;
2858 }
2859
2860 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002861 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864
2865 default:
2866 /* if we stumble upon an unknown formatting code, copy the rest
2867 of the format string to the output string. (we cannot just
2868 skip the code, since there's no way to know what's in the
2869 argument list) */
2870 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002871 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002872 return NULL;
2873 f = p+len;
2874 return f;
2875 }
2876
2877 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002878 return f;
2879}
2880
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881PyObject *
2882PyUnicode_FromFormatV(const char *format, va_list vargs)
2883{
Victor Stinnere215d962012-10-06 23:03:36 +02002884 va_list vargs2;
2885 const char *f;
2886 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002887
Victor Stinner8f674cc2013-04-17 23:02:17 +02002888 _PyUnicodeWriter_Init(&writer);
2889 writer.min_length = strlen(format) + 100;
2890 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002891
Benjamin Peterson0c212142016-09-20 20:39:33 -07002892 // Copy varags to be able to pass a reference to a subfunction.
2893 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002894
2895 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002897 f = unicode_fromformat_arg(&writer, f, &vargs2);
2898 if (f == NULL)
2899 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002902 const char *p;
2903 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904
Victor Stinnere215d962012-10-06 23:03:36 +02002905 p = f;
2906 do
2907 {
2908 if ((unsigned char)*p > 127) {
2909 PyErr_Format(PyExc_ValueError,
2910 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2911 "string, got a non-ASCII byte: 0x%02x",
2912 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 p++;
2916 }
2917 while (*p != '\0' && *p != '%');
2918 len = p - f;
2919
2920 if (*p == '\0')
2921 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002922
2923 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925
2926 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002929 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002930 return _PyUnicodeWriter_Finish(&writer);
2931
2932 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002933 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002934 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936}
2937
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938PyObject *
2939PyUnicode_FromFormat(const char *format, ...)
2940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002941 PyObject* ret;
2942 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002943
2944#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 ret = PyUnicode_FromFormatV(format, vargs);
2950 va_end(vargs);
2951 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954#ifdef HAVE_WCHAR_H
2955
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002956/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002957
Victor Stinnerd88d9832011-09-06 02:00:05 +02002958 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 character) required to convert the unicode object. Ignore size argument.
2960
Victor Stinnerd88d9832011-09-06 02:00:05 +02002961 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002962 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002963 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002964Py_ssize_t
2965PyUnicode_AsWideChar(PyObject *unicode,
2966 wchar_t *w,
2967 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002968{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002969 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002970 const wchar_t *wstr;
2971
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002972 if (unicode == NULL) {
2973 PyErr_BadInternalCall();
2974 return -1;
2975 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002976 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002977 if (wstr == NULL)
2978 return -1;
2979
Victor Stinner5593d8a2010-10-02 11:11:27 +00002980 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002981 if (size > res)
2982 size = res + 1;
2983 else
2984 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002985 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002986 return res;
2987 }
2988 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002989 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002990}
2991
Victor Stinner137c34c2010-09-29 10:25:54 +00002992wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002993PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002994 Py_ssize_t *size)
2995{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002996 const wchar_t *wstr;
2997 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00002998 Py_ssize_t buflen;
2999
3000 if (unicode == NULL) {
3001 PyErr_BadInternalCall();
3002 return NULL;
3003 }
3004
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003005 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3006 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003007 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003008 }
3009 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3010 PyErr_SetString(PyExc_ValueError,
3011 "embedded null character");
3012 return NULL;
3013 }
3014
3015 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003016 if (buffer == NULL) {
3017 PyErr_NoMemory();
3018 return NULL;
3019 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003020 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003021 if (size != NULL)
3022 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003023 return buffer;
3024}
3025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003026#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027
Alexander Belopolsky40018472011-02-26 01:02:56 +00003028PyObject *
3029PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003030{
Victor Stinner8faf8212011-12-08 22:14:11 +01003031 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 PyErr_SetString(PyExc_ValueError,
3033 "chr() arg not in range(0x110000)");
3034 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003035 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003036
Victor Stinner985a82a2014-01-03 12:53:47 +01003037 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003041PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003043 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003045 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003046 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003047 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003048 Py_INCREF(obj);
3049 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003050 }
3051 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 /* For a Unicode subtype that's not a Unicode object,
3053 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003054 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003055 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003056 PyErr_Format(PyExc_TypeError,
3057 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003058 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003059 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003060}
3061
Alexander Belopolsky40018472011-02-26 01:02:56 +00003062PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003063PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003064 const char *encoding,
3065 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003066{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003067 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003069
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 PyErr_BadInternalCall();
3072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003074
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003075 /* Decoding bytes objects is the most common case and should be fast */
3076 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003077 if (PyBytes_GET_SIZE(obj) == 0)
3078 _Py_RETURN_UNICODE_EMPTY();
3079 v = PyUnicode_Decode(
3080 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3081 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003082 return v;
3083 }
3084
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003085 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 PyErr_SetString(PyExc_TypeError,
3087 "decoding str is not supported");
3088 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003089 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003090
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003091 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3092 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3093 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003094 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003095 Py_TYPE(obj)->tp_name);
3096 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003097 }
Tim Petersced69f82003-09-16 20:30:58 +00003098
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003099 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003100 PyBuffer_Release(&buffer);
3101 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003103
Serhiy Storchaka05997252013-01-26 12:14:02 +02003104 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003105 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003106 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107}
3108
/* Normalize an encoding name into the caller-supplied buffer `lower`
   (capacity lower_len bytes, terminator included).

   Similar to encodings.normalize_encoding(), but also converts to
   lowercase: alphanumeric characters and '.' are copied lowercased, and
   every run of other characters between two kept characters collapses
   into a single '_'.  Leading and trailing runs are dropped.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    char *out_limit = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember that a separator may be needed. */
            pending_sep = 1;
            continue;
        }

        /* Emit the separator only between two kept characters, never at
           the very start of the output. */
        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3157
Alexander Belopolsky40018472011-02-26 01:02:56 +00003158PyObject *
3159PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003160 Py_ssize_t size,
3161 const char *encoding,
3162 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003163{
3164 PyObject *buffer = NULL, *unicode;
3165 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003166 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3167
3168 if (encoding == NULL) {
3169 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3170 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003171
Fred Drakee4315f52000-05-09 19:53:39 +00003172 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003173 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3174 char *lower = buflower;
3175
3176 /* Fast paths */
3177 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3178 lower += 3;
3179 if (*lower == '_') {
3180 /* Match "utf8" and "utf_8" */
3181 lower++;
3182 }
3183
3184 if (lower[0] == '8' && lower[1] == 0) {
3185 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3186 }
3187 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3188 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3189 }
3190 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3191 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3192 }
3193 }
3194 else {
3195 if (strcmp(lower, "ascii") == 0
3196 || strcmp(lower, "us_ascii") == 0) {
3197 return PyUnicode_DecodeASCII(s, size, errors);
3198 }
Steve Dowercc16be82016-09-08 10:35:16 -07003199 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003200 else if (strcmp(lower, "mbcs") == 0) {
3201 return PyUnicode_DecodeMBCS(s, size, errors);
3202 }
3203 #endif
3204 else if (strcmp(lower, "latin1") == 0
3205 || strcmp(lower, "latin_1") == 0
3206 || strcmp(lower, "iso_8859_1") == 0
3207 || strcmp(lower, "iso8859_1") == 0) {
3208 return PyUnicode_DecodeLatin1(s, size, errors);
3209 }
3210 }
Victor Stinner37296e82010-06-10 13:36:23 +00003211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212
3213 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003214 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003215 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003216 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003217 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 if (buffer == NULL)
3219 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003220 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 if (unicode == NULL)
3222 goto onError;
3223 if (!PyUnicode_Check(unicode)) {
3224 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003225 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3226 "use codecs.decode() to decode to arbitrary types",
3227 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003228 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 Py_DECREF(unicode);
3230 goto onError;
3231 }
3232 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003233 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003234
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 Py_XDECREF(buffer);
3237 return NULL;
3238}
3239
Alexander Belopolsky40018472011-02-26 01:02:56 +00003240PyObject *
3241PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003242 const char *encoding,
3243 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003244{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003245 if (!PyUnicode_Check(unicode)) {
3246 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003247 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003248 }
3249
Serhiy Storchaka00939072016-10-27 21:05:49 +03003250 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3251 "PyUnicode_AsDecodedObject() is deprecated; "
3252 "use PyCodec_Decode() to decode from str", 1) < 0)
3253 return NULL;
3254
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003255 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003257
3258 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003259 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003260}
3261
Alexander Belopolsky40018472011-02-26 01:02:56 +00003262PyObject *
3263PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003264 const char *encoding,
3265 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003266{
3267 PyObject *v;
3268
3269 if (!PyUnicode_Check(unicode)) {
3270 PyErr_BadArgument();
3271 goto onError;
3272 }
3273
Serhiy Storchaka00939072016-10-27 21:05:49 +03003274 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3275 "PyUnicode_AsDecodedUnicode() is deprecated; "
3276 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3277 return NULL;
3278
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003279 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003281
3282 /* Decode via the codec registry */
3283 v = PyCodec_Decode(unicode, encoding, errors);
3284 if (v == NULL)
3285 goto onError;
3286 if (!PyUnicode_Check(v)) {
3287 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003288 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3289 "use codecs.decode() to decode to arbitrary types",
3290 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003291 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292 Py_DECREF(v);
3293 goto onError;
3294 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003295 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003296
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003298 return NULL;
3299}
3300
Alexander Belopolsky40018472011-02-26 01:02:56 +00003301PyObject *
3302PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003303 Py_ssize_t size,
3304 const char *encoding,
3305 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306{
3307 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003308
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003309 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3313 Py_DECREF(unicode);
3314 return v;
3315}
3316
Alexander Belopolsky40018472011-02-26 01:02:56 +00003317PyObject *
3318PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003319 const char *encoding,
3320 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003321{
3322 PyObject *v;
3323
3324 if (!PyUnicode_Check(unicode)) {
3325 PyErr_BadArgument();
3326 goto onError;
3327 }
3328
Serhiy Storchaka00939072016-10-27 21:05:49 +03003329 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3330 "PyUnicode_AsEncodedObject() is deprecated; "
3331 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3332 "or PyCodec_Encode() for generic encoding", 1) < 0)
3333 return NULL;
3334
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003335 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003337
3338 /* Encode via the codec registry */
3339 v = PyCodec_Encode(unicode, encoding, errors);
3340 if (v == NULL)
3341 goto onError;
3342 return v;
3343
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003345 return NULL;
3346}
3347
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one
   surrogate pair at a time when wchar_t is 16 bits wide).

   Return the index of that character in wstr, or 0 if every character
   converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Scratch string holding a single character (or surrogate pair)
       followed by a terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif

        if (wcstombs(encoded, unit, sizeof(encoded)) == (size_t)-1) {
            return char_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3394
Victor Stinner1b579672011-12-17 05:47:23 +01003395static int
3396locale_error_handler(const char *errors, int *surrogateescape)
3397{
Victor Stinner50149202015-09-22 00:26:54 +02003398 _Py_error_handler error_handler = get_error_handler(errors);
3399 switch (error_handler)
3400 {
3401 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003402 *surrogateescape = 0;
3403 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003404 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003405 *surrogateescape = 1;
3406 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003407 default:
3408 PyErr_Format(PyExc_ValueError,
3409 "only 'strict' and 'surrogateescape' error handlers "
3410 "are supported, not '%s'",
3411 errors);
3412 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003413 }
Victor Stinner1b579672011-12-17 05:47:23 +01003414}
3415
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003417PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418{
3419 Py_ssize_t wlen, wlen2;
3420 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003422 PyObject *bytes, *reason, *exc;
3423 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003424 int surrogateescape;
3425
3426 if (locale_error_handler(errors, &surrogateescape) < 0)
3427 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003428
3429 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3430 if (wstr == NULL)
3431 return NULL;
3432
3433 wlen2 = wcslen(wstr);
3434 if (wlen2 != wlen) {
3435 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003436 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003437 return NULL;
3438 }
3439
3440 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003441 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003442 char *str;
3443
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003444 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003445 if (str == NULL) {
3446 if (error_pos == (size_t)-1) {
3447 PyErr_NoMemory();
3448 PyMem_Free(wstr);
3449 return NULL;
3450 }
3451 else {
3452 goto encode_error;
3453 }
3454 }
3455 PyMem_Free(wstr);
3456
3457 bytes = PyBytes_FromString(str);
3458 PyMem_Free(str);
3459 }
3460 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003461 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003462 size_t len, len2;
3463
3464 len = wcstombs(NULL, wstr, 0);
3465 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003466 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003467 goto encode_error;
3468 }
3469
3470 bytes = PyBytes_FromStringAndSize(NULL, len);
3471 if (bytes == NULL) {
3472 PyMem_Free(wstr);
3473 return NULL;
3474 }
3475
3476 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3477 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003478 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003479 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003480 goto encode_error;
3481 }
3482 PyMem_Free(wstr);
3483 }
3484 return bytes;
3485
3486encode_error:
3487 errmsg = strerror(errno);
3488 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003489
3490 if (error_pos == (size_t)-1)
3491 error_pos = wcstombs_errorpos(wstr);
3492
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003494
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003495 wstr = Py_DecodeLocale(errmsg, &errlen);
3496 if (wstr != NULL) {
3497 reason = PyUnicode_FromWideChar(wstr, errlen);
3498 PyMem_RawFree(wstr);
3499 } else {
3500 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003501 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003502
Victor Stinner2f197072011-12-17 07:08:30 +01003503 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003504 reason = PyUnicode_FromString(
3505 "wcstombs() encountered an unencodable "
3506 "wide character");
3507 if (reason == NULL)
3508 return NULL;
3509
3510 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3511 "locale", unicode,
3512 (Py_ssize_t)error_pos,
3513 (Py_ssize_t)(error_pos+1),
3514 reason);
3515 Py_DECREF(reason);
3516 if (exc != NULL) {
3517 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003518 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003519 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003520 return NULL;
3521}
3522
Victor Stinnerad158722010-10-27 00:25:46 +00003523PyObject *
3524PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003525{
Steve Dowercc16be82016-09-08 10:35:16 -07003526#if defined(__APPLE__)
3527 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003528#else
Victor Stinner793b5312011-04-27 00:24:21 +02003529 PyInterpreterState *interp = PyThreadState_GET()->interp;
3530 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3531 cannot use it to encode and decode filenames before it is loaded. Load
3532 the Python codec requires to encode at least its own filename. Use the C
3533 version of the locale codec until the codec registry is initialized and
3534 the Python codec is loaded.
3535
3536 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3537 cannot only rely on it: check also interp->fscodec_initialized for
3538 subinterpreters. */
3539 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003540 return PyUnicode_AsEncodedString(unicode,
3541 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003542 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003543 }
3544 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003545 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003546 }
Victor Stinnerad158722010-10-27 00:25:46 +00003547#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003548}
3549
Alexander Belopolsky40018472011-02-26 01:02:56 +00003550PyObject *
3551PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003552 const char *encoding,
3553 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554{
3555 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003556 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003557
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 if (!PyUnicode_Check(unicode)) {
3559 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 }
Fred Drakee4315f52000-05-09 19:53:39 +00003562
Victor Stinner942889a2016-09-05 15:40:10 -07003563 if (encoding == NULL) {
3564 return _PyUnicode_AsUTF8String(unicode, errors);
3565 }
3566
Fred Drakee4315f52000-05-09 19:53:39 +00003567 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003568 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3569 char *lower = buflower;
3570
3571 /* Fast paths */
3572 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3573 lower += 3;
3574 if (*lower == '_') {
3575 /* Match "utf8" and "utf_8" */
3576 lower++;
3577 }
3578
3579 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003580 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003581 }
3582 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3583 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3584 }
3585 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3586 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3587 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003588 }
Victor Stinner942889a2016-09-05 15:40:10 -07003589 else {
3590 if (strcmp(lower, "ascii") == 0
3591 || strcmp(lower, "us_ascii") == 0) {
3592 return _PyUnicode_AsASCIIString(unicode, errors);
3593 }
Steve Dowercc16be82016-09-08 10:35:16 -07003594#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003595 else if (strcmp(lower, "mbcs") == 0) {
3596 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3597 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003598#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003599 else if (strcmp(lower, "latin1") == 0 ||
3600 strcmp(lower, "latin_1") == 0 ||
3601 strcmp(lower, "iso_8859_1") == 0 ||
3602 strcmp(lower, "iso8859_1") == 0) {
3603 return _PyUnicode_AsLatin1String(unicode, errors);
3604 }
3605 }
Victor Stinner37296e82010-06-10 13:36:23 +00003606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607
3608 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003609 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003611 return NULL;
3612
3613 /* The normal path */
3614 if (PyBytes_Check(v))
3615 return v;
3616
3617 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003618 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003619 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003620 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003621
3622 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003623 "encoder %s returned bytearray instead of bytes; "
3624 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003625 encoding);
3626 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003627 Py_DECREF(v);
3628 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003629 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003630
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003631 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3632 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003633 Py_DECREF(v);
3634 return b;
3635 }
3636
3637 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003638 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3639 "use codecs.encode() to encode to arbitrary types",
3640 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003641 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003642 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003643 return NULL;
3644}
3645
Alexander Belopolsky40018472011-02-26 01:02:56 +00003646PyObject *
3647PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003648 const char *encoding,
3649 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003650{
3651 PyObject *v;
3652
3653 if (!PyUnicode_Check(unicode)) {
3654 PyErr_BadArgument();
3655 goto onError;
3656 }
3657
Serhiy Storchaka00939072016-10-27 21:05:49 +03003658 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3659 "PyUnicode_AsEncodedUnicode() is deprecated; "
3660 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3661 return NULL;
3662
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003663 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003665
3666 /* Encode via the codec registry */
3667 v = PyCodec_Encode(unicode, encoding, errors);
3668 if (v == NULL)
3669 goto onError;
3670 if (!PyUnicode_Check(v)) {
3671 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003672 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3673 "use codecs.encode() to encode to arbitrary types",
3674 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003675 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003676 Py_DECREF(v);
3677 goto onError;
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003680
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 return NULL;
3683}
3684
/* Locate the first byte sequence in str[0:len] that mbrtowc() rejects.

   Returns the byte offset of the first invalid or incomplete multibyte
   sequence.  Returns 0 if no such sequence is found, if a null character
   is reached first, or if mbrtowc() is not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3715
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003716PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003718 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719{
3720 wchar_t smallbuf[256];
3721 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3722 wchar_t *wstr;
3723 size_t wlen, wlen2;
3724 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003725 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003726 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003727 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003728 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003729
3730 if (locale_error_handler(errors, &surrogateescape) < 0)
3731 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003733 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3734 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003735 return NULL;
3736 }
3737
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003738 if (surrogateescape) {
3739 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003740 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003741 if (wstr == NULL) {
3742 if (wlen == (size_t)-1)
3743 PyErr_NoMemory();
3744 else
3745 PyErr_SetFromErrno(PyExc_OSError);
3746 return NULL;
3747 }
3748
3749 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003750 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751 }
3752 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003753 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003754#ifndef HAVE_BROKEN_MBSTOWCS
3755 wlen = mbstowcs(NULL, str, 0);
3756#else
3757 wlen = len;
3758#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003759 if (wlen == (size_t)-1)
3760 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003761 if (wlen+1 <= smallbuf_len) {
3762 wstr = smallbuf;
3763 }
3764 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003765 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003766 if (!wstr)
3767 return PyErr_NoMemory();
3768 }
3769
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003770 wlen2 = mbstowcs(wstr, str, wlen+1);
3771 if (wlen2 == (size_t)-1) {
3772 if (wstr != smallbuf)
3773 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003774 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003775 }
3776#ifdef HAVE_BROKEN_MBSTOWCS
3777 assert(wlen2 == wlen);
3778#endif
3779 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3780 if (wstr != smallbuf)
3781 PyMem_Free(wstr);
3782 }
3783 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003784
3785decode_error:
3786 errmsg = strerror(errno);
3787 assert(errmsg != NULL);
3788
3789 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003790 wstr = Py_DecodeLocale(errmsg, &errlen);
3791 if (wstr != NULL) {
3792 reason = PyUnicode_FromWideChar(wstr, errlen);
3793 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003794 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003795
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003796 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003797 reason = PyUnicode_FromString(
3798 "mbstowcs() encountered an invalid multibyte sequence");
3799 if (reason == NULL)
3800 return NULL;
3801
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3803 "locale", str, len,
3804 (Py_ssize_t)error_pos,
3805 (Py_ssize_t)(error_pos+1),
3806 reason);
3807 Py_DECREF(reason);
3808 if (exc != NULL) {
3809 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003810 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003811 }
3812 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003813}
3814
3815PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003816PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003817{
3818 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003819 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003820}
3821
3822
3823PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003824PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003825 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003826 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3827}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003828
Christian Heimes5894ba72007-11-04 11:43:14 +00003829PyObject*
3830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3831{
Steve Dowercc16be82016-09-08 10:35:16 -07003832#if defined(__APPLE__)
3833 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003834#else
Victor Stinner793b5312011-04-27 00:24:21 +02003835 PyInterpreterState *interp = PyThreadState_GET()->interp;
3836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3837 cannot use it to encode and decode filenames before it is loaded. Load
3838 the Python codec requires to encode at least its own filename. Use the C
3839 version of the locale codec until the codec registry is initialized and
3840 the Python codec is loaded.
3841
3842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3843 cannot only rely on it: check also interp->fscodec_initialized for
3844 subinterpreters. */
3845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003846 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003848 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003849 }
3850 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003851 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003852 }
Victor Stinnerad158722010-10-27 00:25:46 +00003853#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003854}
3855
Martin v. Löwis011e8422009-05-05 04:43:17 +00003856
3857int
3858PyUnicode_FSConverter(PyObject* arg, void* addr)
3859{
Brett Cannonec6ce872016-09-06 15:50:29 -07003860 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003861 PyObject *output = NULL;
3862 Py_ssize_t size;
3863 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003864 if (arg == NULL) {
3865 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003866 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003867 return 1;
3868 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003869 path = PyOS_FSPath(arg);
3870 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003871 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003872 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003873 if (PyBytes_Check(path)) {
3874 output = path;
3875 }
3876 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3877 output = PyUnicode_EncodeFSDefault(path);
3878 Py_DECREF(path);
3879 if (!output) {
3880 return 0;
3881 }
3882 assert(PyBytes_Check(output));
3883 }
3884
Victor Stinner0ea2a462010-04-30 00:22:08 +00003885 size = PyBytes_GET_SIZE(output);
3886 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003887 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003888 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003889 Py_DECREF(output);
3890 return 0;
3891 }
3892 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003893 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003894}
3895
3896
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003897int
3898PyUnicode_FSDecoder(PyObject* arg, void* addr)
3899{
Brett Cannona5711202016-09-06 19:36:01 -07003900 int is_buffer = 0;
3901 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003902 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003903 if (arg == NULL) {
3904 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003905 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003906 return 1;
3907 }
Brett Cannona5711202016-09-06 19:36:01 -07003908
3909 is_buffer = PyObject_CheckBuffer(arg);
3910 if (!is_buffer) {
3911 path = PyOS_FSPath(arg);
3912 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003913 return 0;
3914 }
Brett Cannona5711202016-09-06 19:36:01 -07003915 }
3916 else {
3917 path = arg;
3918 Py_INCREF(arg);
3919 }
3920
3921 if (PyUnicode_Check(path)) {
3922 if (PyUnicode_READY(path) == -1) {
3923 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003924 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003925 }
3926 output = path;
3927 }
3928 else if (PyBytes_Check(path) || is_buffer) {
3929 PyObject *path_bytes = NULL;
3930
3931 if (!PyBytes_Check(path) &&
3932 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3933 "path should be string, bytes, or os.PathLike, not %.200s",
3934 Py_TYPE(arg)->tp_name)) {
3935 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003936 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003937 }
3938 path_bytes = PyBytes_FromObject(path);
3939 Py_DECREF(path);
3940 if (!path_bytes) {
3941 return 0;
3942 }
3943 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3944 PyBytes_GET_SIZE(path_bytes));
3945 Py_DECREF(path_bytes);
3946 if (!output) {
3947 return 0;
3948 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003949 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003950 else {
3951 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003952 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003953 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003954 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003955 return 0;
3956 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003957 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003958 Py_DECREF(output);
3959 return 0;
3960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003962 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003963 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003964 Py_DECREF(output);
3965 return 0;
3966 }
3967 *(PyObject**)addr = output;
3968 return Py_CLEANUP_SUPPORTED;
3969}
3970
3971
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003972const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003974{
Christian Heimesf3863112007-11-22 07:46:41 +00003975 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003977 if (!PyUnicode_Check(unicode)) {
3978 PyErr_BadArgument();
3979 return NULL;
3980 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003981 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003982 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003984 if (PyUnicode_UTF8(unicode) == NULL) {
3985 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003986 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 if (bytes == NULL)
3988 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003989 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3990 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003991 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 Py_DECREF(bytes);
3993 return NULL;
3994 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003995 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003996 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 PyBytes_AS_STRING(bytes),
3998 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 Py_DECREF(bytes);
4000 }
4001
4002 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004003 *psize = PyUnicode_UTF8_LENGTH(unicode);
4004 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004005}
4006
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004007const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4011}
4012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013Py_UNICODE *
4014PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 const unsigned char *one_byte;
4017#if SIZEOF_WCHAR_T == 4
4018 const Py_UCS2 *two_bytes;
4019#else
4020 const Py_UCS4 *four_bytes;
4021 const Py_UCS4 *ucs4_end;
4022 Py_ssize_t num_surrogates;
4023#endif
4024 wchar_t *w;
4025 wchar_t *wchar_end;
4026
4027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 return NULL;
4030 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004031 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004033 assert(_PyUnicode_KIND(unicode) != 0);
4034 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004036 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004038 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4039 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 num_surrogates = 0;
4041
4042 for (; four_bytes < ucs4_end; ++four_bytes) {
4043 if (*four_bytes > 0xFFFF)
4044 ++num_surrogates;
4045 }
4046
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004047 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4048 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4049 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 PyErr_NoMemory();
4051 return NULL;
4052 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004053 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004055 w = _PyUnicode_WSTR(unicode);
4056 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4057 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4059 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004060 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004062 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4063 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 }
4065 else
4066 *w = *four_bytes;
4067
4068 if (w > wchar_end) {
4069 assert(0 && "Miscalculated string end");
4070 }
4071 }
4072 *w = 0;
4073#else
4074 /* sizeof(wchar_t) == 4 */
4075 Py_FatalError("Impossible unicode object state, wstr and str "
4076 "should share memory already.");
4077 return NULL;
4078#endif
4079 }
4080 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004081 if ((size_t)_PyUnicode_LENGTH(unicode) >
4082 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4083 PyErr_NoMemory();
4084 return NULL;
4085 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004086 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4087 (_PyUnicode_LENGTH(unicode) + 1));
4088 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 PyErr_NoMemory();
4090 return NULL;
4091 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004092 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4093 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4094 w = _PyUnicode_WSTR(unicode);
4095 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004097 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4098 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099 for (; w < wchar_end; ++one_byte, ++w)
4100 *w = *one_byte;
4101 /* null-terminate the wstr */
4102 *w = 0;
4103 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004104 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004106 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107 for (; w < wchar_end; ++two_bytes, ++w)
4108 *w = *two_bytes;
4109 /* null-terminate the wstr */
4110 *w = 0;
4111#else
4112 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004113 PyObject_FREE(_PyUnicode_WSTR(unicode));
4114 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 Py_FatalError("Impossible unicode object state, wstr "
4116 "and str should share memory already.");
4117 return NULL;
4118#endif
4119 }
4120 else {
4121 assert(0 && "This should never happen.");
4122 }
4123 }
4124 }
4125 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004126 *size = PyUnicode_WSTR_LENGTH(unicode);
4127 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004128}
4129
Alexander Belopolsky40018472011-02-26 01:02:56 +00004130Py_UNICODE *
4131PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134}
4135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136
Alexander Belopolsky40018472011-02-26 01:02:56 +00004137Py_ssize_t
4138PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139{
4140 if (!PyUnicode_Check(unicode)) {
4141 PyErr_BadArgument();
4142 goto onError;
4143 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004144 if (_PyUnicode_WSTR(unicode) == NULL) {
4145 if (PyUnicode_AsUnicode(unicode) == NULL)
4146 goto onError;
4147 }
4148 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 return -1;
4152}
4153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154Py_ssize_t
4155PyUnicode_GetLength(PyObject *unicode)
4156{
Victor Stinner07621332012-06-16 04:53:46 +02004157 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158 PyErr_BadArgument();
4159 return -1;
4160 }
Victor Stinner07621332012-06-16 04:53:46 +02004161 if (PyUnicode_READY(unicode) == -1)
4162 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004163 return PyUnicode_GET_LENGTH(unicode);
4164}
4165
4166Py_UCS4
4167PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4168{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004169 void *data;
4170 int kind;
4171
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004172 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4173 PyErr_BadArgument();
4174 return (Py_UCS4)-1;
4175 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004176 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004177 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 return (Py_UCS4)-1;
4179 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004180 data = PyUnicode_DATA(unicode);
4181 kind = PyUnicode_KIND(unicode);
4182 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004183}
4184
4185int
4186PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4187{
4188 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004189 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004190 return -1;
4191 }
Victor Stinner488fa492011-12-12 00:01:39 +01004192 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004193 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004194 PyErr_SetString(PyExc_IndexError, "string index out of range");
4195 return -1;
4196 }
Victor Stinner488fa492011-12-12 00:01:39 +01004197 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004198 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004199 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4200 PyErr_SetString(PyExc_ValueError, "character out of range");
4201 return -1;
4202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004203 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4204 index, ch);
4205 return 0;
4206}
4207
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4213
Victor Stinner554f3f02010-06-16 23:33:54 +00004214/* create or adjust a UnicodeDecodeError */
4215static void
4216make_decode_exception(PyObject **exceptionObject,
4217 const char *encoding,
4218 const char *input, Py_ssize_t length,
4219 Py_ssize_t startpos, Py_ssize_t endpos,
4220 const char *reason)
4221{
4222 if (*exceptionObject == NULL) {
4223 *exceptionObject = PyUnicodeDecodeError_Create(
4224 encoding, input, length, startpos, endpos, reason);
4225 }
4226 else {
4227 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4228 goto onError;
4229 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4230 goto onError;
4231 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4232 goto onError;
4233 }
4234 return;
4235
4236onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004237 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004238}
4239
#ifdef MS_WINDOWS
/* Error handling callback helper (wchar_t output buffer variant):
   build the arguments, call the error handler callback and check its
   result; if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler: name of the error handler and a cache for the
       looked-up callable (looked up on first use when *errorHandler is
       NULL).
   encoding, reason: used to create or update the UnicodeDecodeError.
   input, inend: the input buffer bounds; they are re-read from the
       exception object after the callback, which may have replaced it.
   startinpos, endinpos: bounds of the offending input; *endinpos is set
       to the position returned by the handler.
   exceptionObject: cache for the exception instance.
   inptr: set to the input position at which decoding must resume.
   output, outpos: the wchar_t-kind output string (resized if needed)
       and the current write position in it.

   Return 0 on success, -1 on error (with an exception set).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after "Un;" doubles as the TypeError message below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is enough and cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  The replacement may legitimately
       contain embedded NUL characters; wcsncpy() would stop at the first
       one and zero-fill the rest, silently corrupting the replacement. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004352
4353static int
4354unicode_decode_call_errorhandler_writer(
4355 const char *errors, PyObject **errorHandler,
4356 const char *encoding, const char *reason,
4357 const char **input, const char **inend, Py_ssize_t *startinpos,
4358 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4359 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4360{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004361 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004362
4363 PyObject *restuple = NULL;
4364 PyObject *repunicode = NULL;
4365 Py_ssize_t insize;
4366 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004367 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004368 PyObject *inputobj = NULL;
4369
4370 if (*errorHandler == NULL) {
4371 *errorHandler = PyCodec_LookupError(errors);
4372 if (*errorHandler == NULL)
4373 goto onError;
4374 }
4375
4376 make_decode_exception(exceptionObject,
4377 encoding,
4378 *input, *inend - *input,
4379 *startinpos, *endinpos,
4380 reason);
4381 if (*exceptionObject == NULL)
4382 goto onError;
4383
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004384 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004385 if (restuple == NULL)
4386 goto onError;
4387 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004388 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004389 goto onError;
4390 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004391 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004392 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004393
4394 /* Copy back the bytes variables, which might have been modified by the
4395 callback */
4396 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4397 if (!inputobj)
4398 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004399 *input = PyBytes_AS_STRING(inputobj);
4400 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004401 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004402 /* we can DECREF safely, as the exception has another reference,
4403 so the object won't go away. */
4404 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004408 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004409 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004411 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412
Victor Stinner170ca6f2013-04-18 00:25:28 +02004413 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004414 if (replen > 1) {
4415 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004416 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004417 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4418 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4419 goto onError;
4420 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004421 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004422 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004425 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004428 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004433 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434}
4435
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions (e.g. "a || b") can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516
Alexander Belopolsky40018472011-02-26 01:02:56 +00004517PyObject *
4518PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004519 Py_ssize_t size,
4520 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004522 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4523}
4524
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525/* The decoder. The only state we preserve is our read position,
4526 * i.e. how many characters we have consumed. So if we end in the
4527 * middle of a shift sequence we have to back off the read position
4528 * and the output to the beginning of the sequence, otherwise we lose
4529 * all the shift state (seen bits, number of bits seen, high
4530 * surrogate). */
4531
Alexander Belopolsky40018472011-02-26 01:02:56 +00004532PyObject *
4533PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004534 Py_ssize_t size,
4535 const char *errors,
4536 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004539 Py_ssize_t startinpos;
4540 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004542 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004543 const char *errmsg = "";
4544 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004545 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 unsigned int base64bits = 0;
4547 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004548 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 PyObject *errorHandler = NULL;
4550 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004551
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004552 if (size == 0) {
4553 if (consumed)
4554 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004555 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004556 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004558 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004559 _PyUnicodeWriter_Init(&writer);
4560 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004561
4562 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563 e = s + size;
4564
4565 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004566 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004568 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 if (inShift) { /* in a base-64 section */
4571 if (IS_BASE64(ch)) { /* consume a base-64 character */
4572 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4573 base64bits += 6;
4574 s++;
4575 if (base64bits >= 16) {
4576 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004577 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 base64bits -= 16;
4579 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004580 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (surrogate) {
4582 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004583 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4584 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004585 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004586 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004588 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 }
4590 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004591 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004592 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 }
4595 }
Victor Stinner551ac952011-11-29 22:58:13 +01004596 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 /* first surrogate */
4598 surrogate = outCh;
4599 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004601 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004602 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 }
4604 }
4605 }
4606 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004607 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 if (base64bits > 0) { /* left-over bits */
4609 if (base64bits >= 6) {
4610 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004611 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 errmsg = "partial character in shift sequence";
4613 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 else {
4616 /* Some bits remain; they should be zero */
4617 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004618 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 errmsg = "non-zero padding bits in shift sequence";
4620 goto utf7Error;
4621 }
4622 }
4623 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004624 if (surrogate && DECODE_DIRECT(ch)) {
4625 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4626 goto onError;
4627 }
4628 surrogate = 0;
4629 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 /* '-' is absorbed; other terminating
4631 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004632 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004634 }
4635 }
4636 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 s++; /* consume '+' */
4639 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004641 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004642 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643 }
4644 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004645 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004646 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004647 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004648 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004649 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004650 }
4651 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004654 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004655 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 else {
4658 startinpos = s-starts;
4659 s++;
4660 errmsg = "unexpected special character";
4661 goto utf7Error;
4662 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004663 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004666 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 errors, &errorHandler,
4668 "utf7", errmsg,
4669 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004672 }
4673
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 /* end of string */
4675
4676 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4677 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004678 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679 if (surrogate ||
4680 (base64bits >= 6) ||
4681 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004682 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004683 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 errors, &errorHandler,
4685 "utf7", "unterminated shift sequence",
4686 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004687 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 goto onError;
4689 if (s < e)
4690 goto restart;
4691 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004692 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693
4694 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004695 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004696 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004697 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004698 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004699 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004700 writer.kind, writer.data, shiftOutStart);
4701 Py_XDECREF(errorHandler);
4702 Py_XDECREF(exc);
4703 _PyUnicodeWriter_Dealloc(&writer);
4704 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004705 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004706 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 }
4708 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004709 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004710 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004711 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004713 Py_XDECREF(errorHandler);
4714 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004715 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004716
Benjamin Peterson29060642009-01-31 22:14:21 +00004717 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 Py_XDECREF(errorHandler);
4719 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004720 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004721 return NULL;
4722}
4723
4724
Alexander Belopolsky40018472011-02-26 01:02:56 +00004725PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004726_PyUnicode_EncodeUTF7(PyObject *str,
4727 int base64SetO,
4728 int base64WhiteSpace,
4729 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004730{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004731 int kind;
4732 void *data;
4733 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004734 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004736 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004737 unsigned int base64bits = 0;
4738 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739 char * out;
4740 char * start;
4741
Benjamin Petersonbac79492012-01-14 13:34:47 -05004742 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004743 return NULL;
4744 kind = PyUnicode_KIND(str);
4745 data = PyUnicode_DATA(str);
4746 len = PyUnicode_GET_LENGTH(str);
4747
4748 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004750
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004751 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004752 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004753 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004754 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004755 if (v == NULL)
4756 return NULL;
4757
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004758 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004759 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004760 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004761
Antoine Pitrou244651a2009-05-04 18:56:13 +00004762 if (inShift) {
4763 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4764 /* shifting out */
4765 if (base64bits) { /* output remaining bits */
4766 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4767 base64buffer = 0;
4768 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004769 }
4770 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004771 /* Characters not in the BASE64 set implicitly unshift the sequence
4772 so no '-' is required, except if the character is itself a '-' */
4773 if (IS_BASE64(ch) || ch == '-') {
4774 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004775 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 *out++ = (char) ch;
4777 }
4778 else {
4779 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004780 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004781 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 else { /* not in a shift sequence */
4783 if (ch == '+') {
4784 *out++ = '+';
4785 *out++ = '-';
4786 }
4787 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4788 *out++ = (char) ch;
4789 }
4790 else {
4791 *out++ = '+';
4792 inShift = 1;
4793 goto encode_char;
4794 }
4795 }
4796 continue;
4797encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004798 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004799 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004800
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 /* code first surrogate */
4802 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004803 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004804 while (base64bits >= 6) {
4805 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4806 base64bits -= 6;
4807 }
4808 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004809 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004811 base64bits += 16;
4812 base64buffer = (base64buffer << 16) | ch;
4813 while (base64bits >= 6) {
4814 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4815 base64bits -= 6;
4816 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004817 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004818 if (base64bits)
4819 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4820 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004821 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004822 if (_PyBytes_Resize(&v, out - start) < 0)
4823 return NULL;
4824 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004825}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004826PyObject *
4827PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4828 Py_ssize_t size,
4829 int base64SetO,
4830 int base64WhiteSpace,
4831 const char *errors)
4832{
4833 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004834 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004835 if (tmp == NULL)
4836 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004837 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004838 base64WhiteSpace, errors);
4839 Py_DECREF(tmp);
4840 return result;
4841}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004842
Antoine Pitrou244651a2009-05-04 18:56:13 +00004843#undef IS_BASE64
4844#undef FROM_BASE64
4845#undef TO_BASE64
4846#undef DECODE_DIRECT
4847#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004848
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849/* --- UTF-8 Codec -------------------------------------------------------- */
4850
Alexander Belopolsky40018472011-02-26 01:02:56 +00004851PyObject *
4852PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004853 Py_ssize_t size,
4854 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855{
Walter Dörwald69652032004-09-07 20:24:22 +00004856 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4857}
4858
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859#include "stringlib/asciilib.h"
4860#include "stringlib/codecs.h"
4861#include "stringlib/undef.h"
4862
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004863#include "stringlib/ucs1lib.h"
4864#include "stringlib/codecs.h"
4865#include "stringlib/undef.h"
4866
4867#include "stringlib/ucs2lib.h"
4868#include "stringlib/codecs.h"
4869#include "stringlib/undef.h"
4870
4871#include "stringlib/ucs4lib.h"
4872#include "stringlib/codecs.h"
4873#include "stringlib/undef.h"
4874
Antoine Pitrouab868312009-01-10 15:40:25 +00004875/* Mask to quickly check whether a C 'long' contains a
4876 non-ASCII, UTF8-encoded char. */
4877#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004878# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004879#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004880# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004881#else
4882# error C 'long' size should be either 4 or 8!
4883#endif
4884
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885static Py_ssize_t
4886ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004889 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004891 /*
4892 * Issue #17237: m68k is a bit different from most architectures in
4893 * that objects do not use "natural alignment" - for example, int and
4894 * long are only aligned at 2-byte boundaries. Therefore the assert()
4895 * won't work; also, tests have shown that skipping the "optimised
4896 * version" will even speed up m68k.
4897 */
4898#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004900 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4901 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 /* Fast path, see in STRINGLIB(utf8_decode) for
4903 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004904 /* Help allocation */
4905 const char *_p = p;
4906 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004907 while (_p < aligned_end) {
4908 unsigned long value = *(const unsigned long *) _p;
4909 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 *((unsigned long *)q) = value;
4912 _p += SIZEOF_LONG;
4913 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004914 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915 p = _p;
4916 while (p < end) {
4917 if ((unsigned char)*p & 0x80)
4918 break;
4919 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004924#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 while (p < end) {
4926 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4927 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004928 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004929 /* Help allocation */
4930 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004931 while (_p < aligned_end) {
4932 unsigned long value = *(unsigned long *) _p;
4933 if (value & ASCII_CHAR_MASK)
4934 break;
4935 _p += SIZEOF_LONG;
4936 }
4937 p = _p;
4938 if (_p == end)
4939 break;
4940 }
4941 if ((unsigned char)*p & 0x80)
4942 break;
4943 ++p;
4944 }
4945 memcpy(dest, start, p - start);
4946 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947}
Antoine Pitrouab868312009-01-10 15:40:25 +00004948
Victor Stinner785938e2011-12-11 20:09:03 +01004949PyObject *
4950PyUnicode_DecodeUTF8Stateful(const char *s,
4951 Py_ssize_t size,
4952 const char *errors,
4953 Py_ssize_t *consumed)
4954{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004955 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004956 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004957 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004958
4959 Py_ssize_t startinpos;
4960 Py_ssize_t endinpos;
4961 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004962 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004964 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004965
4966 if (size == 0) {
4967 if (consumed)
4968 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004969 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004970 }
4971
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4973 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004974 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 *consumed = 1;
4976 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004977 }
4978
Victor Stinner8f674cc2013-04-17 23:02:17 +02004979 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004980 writer.min_length = size;
4981 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004983
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004984 writer.pos = ascii_decode(s, end, writer.data);
4985 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 while (s < end) {
4987 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004989
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 if (PyUnicode_IS_ASCII(writer.buffer))
4992 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004994 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 } else {
4998 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 }
5001
5002 switch (ch) {
5003 case 0:
5004 if (s == end || consumed)
5005 goto End;
5006 errmsg = "unexpected end of data";
5007 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005008 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 break;
5010 case 1:
5011 errmsg = "invalid start byte";
5012 startinpos = s - starts;
5013 endinpos = startinpos + 1;
5014 break;
5015 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005016 case 3:
5017 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018 errmsg = "invalid continuation byte";
5019 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005020 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 break;
5022 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005023 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 goto onError;
5025 continue;
5026 }
5027
Victor Stinner1d65d912015-10-05 13:43:50 +02005028 if (error_handler == _Py_ERROR_UNKNOWN)
5029 error_handler = get_error_handler(errors);
5030
5031 switch (error_handler) {
5032 case _Py_ERROR_IGNORE:
5033 s += (endinpos - startinpos);
5034 break;
5035
5036 case _Py_ERROR_REPLACE:
5037 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5038 goto onError;
5039 s += (endinpos - startinpos);
5040 break;
5041
5042 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005043 {
5044 Py_ssize_t i;
5045
Victor Stinner1d65d912015-10-05 13:43:50 +02005046 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5047 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005048 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005049 ch = (Py_UCS4)(unsigned char)(starts[i]);
5050 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5051 ch + 0xdc00);
5052 writer.pos++;
5053 }
5054 s += (endinpos - startinpos);
5055 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005056 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005057
5058 default:
5059 if (unicode_decode_call_errorhandler_writer(
5060 errors, &error_handler_obj,
5061 "utf-8", errmsg,
5062 &starts, &end, &startinpos, &endinpos, &exc, &s,
5063 &writer))
5064 goto onError;
5065 }
Victor Stinner785938e2011-12-11 20:09:03 +01005066 }
5067
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069 if (consumed)
5070 *consumed = s - starts;
5071
Victor Stinner1d65d912015-10-05 13:43:50 +02005072 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005073 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005074 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075
5076onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005077 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005079 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005080 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005081}
5082
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X and Android.

   Every byte that cannot be decoded is stored as the lone surrogate
   0xDC00 + byte.  When wchar_t is 16 bits wide, code points above 0xFFFF
   are stored as a surrogate pair.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory).  Return NULL on memory
   allocation error, or if `size` is too large for the output buffer size
   to be represented. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wchar_t units (including the
       terminator) are always enough.

       The bound is checked without forming size + 1 first: that addition
       would overflow a signed Py_ssize_t when size == PY_SSIZE_T_MAX. */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* The UCS4 decoder is not expected to hand a code point back. */
            assert(0);
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch == 0 with the input exhausted means we are done. */
            if (!ch && s == e)
                break;
            /* surrogateescape: escape one undecodable byte and resume
               decoding right after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005139/* Primary internal function which creates utf8 encoded bytes objects.
5140
5141 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005142 and allocate exactly as much space needed at the end. Else allocate the
5143 maximum possible needed (4 result bytes per Unicode character), and return
5144 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005145*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005146PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005147_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148{
Victor Stinner6099a032011-12-18 14:22:26 +01005149 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005150 void *data;
5151 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153 if (!PyUnicode_Check(unicode)) {
5154 PyErr_BadArgument();
5155 return NULL;
5156 }
5157
5158 if (PyUnicode_READY(unicode) == -1)
5159 return NULL;
5160
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005161 if (PyUnicode_UTF8(unicode))
5162 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5163 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005164
5165 kind = PyUnicode_KIND(unicode);
5166 data = PyUnicode_DATA(unicode);
5167 size = PyUnicode_GET_LENGTH(unicode);
5168
Benjamin Petersonead6b532011-12-20 17:23:42 -06005169 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005170 default:
5171 assert(0);
5172 case PyUnicode_1BYTE_KIND:
5173 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5174 assert(!PyUnicode_IS_ASCII(unicode));
5175 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5176 case PyUnicode_2BYTE_KIND:
5177 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5178 case PyUnicode_4BYTE_KIND:
5179 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181}
5182
Alexander Belopolsky40018472011-02-26 01:02:56 +00005183PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005184PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5185 Py_ssize_t size,
5186 const char *errors)
5187{
5188 PyObject *v, *unicode;
5189
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005190 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005191 if (unicode == NULL)
5192 return NULL;
5193 v = _PyUnicode_AsUTF8String(unicode, errors);
5194 Py_DECREF(unicode);
5195 return v;
5196}
5197
5198PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005199PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005201 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202}
5203
Walter Dörwald41980ca2007-08-16 21:55:45 +00005204/* --- UTF-32 Codec ------------------------------------------------------- */
5205
5206PyObject *
5207PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 Py_ssize_t size,
5209 const char *errors,
5210 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005211{
5212 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5213}
5214
5215PyObject *
5216PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 Py_ssize_t size,
5218 const char *errors,
5219 int *byteorder,
5220 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005221{
5222 const char *starts = s;
5223 Py_ssize_t startinpos;
5224 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005225 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005226 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005227 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005228 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005229 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230 PyObject *errorHandler = NULL;
5231 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005232
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 q = (unsigned char *)s;
5234 e = q + size;
5235
5236 if (byteorder)
5237 bo = *byteorder;
5238
5239 /* Check for BOM marks (U+FEFF) in the input and adjust current
5240 byte order setting accordingly. In native mode, the leading BOM
5241 mark is skipped, in all other modes, it is copied to the output
5242 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005243 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005244 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005245 if (bom == 0x0000FEFF) {
5246 bo = -1;
5247 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005249 else if (bom == 0xFFFE0000) {
5250 bo = 1;
5251 q += 4;
5252 }
5253 if (byteorder)
5254 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255 }
5256
Victor Stinnere64322e2012-10-30 23:12:47 +01005257 if (q == e) {
5258 if (consumed)
5259 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005260 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005261 }
5262
Victor Stinnere64322e2012-10-30 23:12:47 +01005263#ifdef WORDS_BIGENDIAN
5264 le = bo < 0;
5265#else
5266 le = bo <= 0;
5267#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005268 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005269
Victor Stinner8f674cc2013-04-17 23:02:17 +02005270 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005271 writer.min_length = (e - q + 3) / 4;
5272 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005273 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005274
Victor Stinnere64322e2012-10-30 23:12:47 +01005275 while (1) {
5276 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005277 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005278
Victor Stinnere64322e2012-10-30 23:12:47 +01005279 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005280 enum PyUnicode_Kind kind = writer.kind;
5281 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005282 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005283 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005284 if (le) {
5285 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005286 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005287 if (ch > maxch)
5288 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005289 if (kind != PyUnicode_1BYTE_KIND &&
5290 Py_UNICODE_IS_SURROGATE(ch))
5291 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005292 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005293 q += 4;
5294 } while (q <= last);
5295 }
5296 else {
5297 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005298 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005299 if (ch > maxch)
5300 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005301 if (kind != PyUnicode_1BYTE_KIND &&
5302 Py_UNICODE_IS_SURROGATE(ch))
5303 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005304 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005305 q += 4;
5306 } while (q <= last);
5307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005309 }
5310
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005311 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005312 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005313 startinpos = ((const char *)q) - starts;
5314 endinpos = startinpos + 4;
5315 }
5316 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005317 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005319 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 startinpos = ((const char *)q) - starts;
5322 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 else {
5325 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005326 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005327 goto onError;
5328 q += 4;
5329 continue;
5330 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005331 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 startinpos = ((const char *)q) - starts;
5333 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005335
5336 /* The remaining input chars are ignored if the callback
5337 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005338 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005340 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005342 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005344 }
5345
Walter Dörwald41980ca2007-08-16 21:55:45 +00005346 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005348
Walter Dörwald41980ca2007-08-16 21:55:45 +00005349 Py_XDECREF(errorHandler);
5350 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005351 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355 Py_XDECREF(errorHandler);
5356 Py_XDECREF(exc);
5357 return NULL;
5358}
5359
5360PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005361_PyUnicode_EncodeUTF32(PyObject *str,
5362 const char *errors,
5363 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005365 enum PyUnicode_Kind kind;
5366 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005367 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005368 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005369 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005370#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005371 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005372#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005373 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005375 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005376 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005377 PyObject *errorHandler = NULL;
5378 PyObject *exc = NULL;
5379 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005380
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005381 if (!PyUnicode_Check(str)) {
5382 PyErr_BadArgument();
5383 return NULL;
5384 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005385 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005386 return NULL;
5387 kind = PyUnicode_KIND(str);
5388 data = PyUnicode_DATA(str);
5389 len = PyUnicode_GET_LENGTH(str);
5390
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005392 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005393 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005394 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395 if (v == NULL)
5396 return NULL;
5397
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005398 /* output buffer is 4-bytes aligned */
5399 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005400 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005401 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005402 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005403 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005404 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005405
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005406 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005407 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005408 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005409 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005410 else
5411 encoding = "utf-32";
5412
5413 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005414 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5415 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005416 }
5417
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 pos = 0;
5419 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005420 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421
5422 if (kind == PyUnicode_2BYTE_KIND) {
5423 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5424 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005425 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 else {
5427 assert(kind == PyUnicode_4BYTE_KIND);
5428 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5429 &out, native_ordering);
5430 }
5431 if (pos == len)
5432 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005433
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005434 rep = unicode_encode_call_errorhandler(
5435 errors, &errorHandler,
5436 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005437 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005438 if (!rep)
5439 goto error;
5440
5441 if (PyBytes_Check(rep)) {
5442 repsize = PyBytes_GET_SIZE(rep);
5443 if (repsize & 3) {
5444 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005445 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 "surrogates not allowed");
5447 goto error;
5448 }
5449 moreunits = repsize / 4;
5450 }
5451 else {
5452 assert(PyUnicode_Check(rep));
5453 if (PyUnicode_READY(rep) < 0)
5454 goto error;
5455 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5456 if (!PyUnicode_IS_ASCII(rep)) {
5457 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005458 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 "surrogates not allowed");
5460 goto error;
5461 }
5462 }
5463
5464 /* four bytes are reserved for each surrogate */
5465 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005466 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005467 Py_ssize_t morebytes = 4 * (moreunits - 1);
5468 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5469 /* integer overflow */
5470 PyErr_NoMemory();
5471 goto error;
5472 }
5473 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5474 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005475 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005476 }
5477
5478 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005479 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005480 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005482 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005483 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5484 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 }
5486
5487 Py_CLEAR(rep);
5488 }
5489
5490 /* Cut back to size actually needed. This is necessary for, for example,
5491 encoding of a string containing isolated surrogates and the 'ignore'
5492 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005493 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 if (nsize != PyBytes_GET_SIZE(v))
5495 _PyBytes_Resize(&v, nsize);
5496 Py_XDECREF(errorHandler);
5497 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005498 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005499 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005500 error:
5501 Py_XDECREF(rep);
5502 Py_XDECREF(errorHandler);
5503 Py_XDECREF(exc);
5504 Py_XDECREF(v);
5505 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005506}
5507
Alexander Belopolsky40018472011-02-26 01:02:56 +00005508PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005509PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5510 Py_ssize_t size,
5511 const char *errors,
5512 int byteorder)
5513{
5514 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005515 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005516 if (tmp == NULL)
5517 return NULL;
5518 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5519 Py_DECREF(tmp);
5520 return result;
5521}
5522
5523PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005524PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005525{
Victor Stinnerb960b342011-11-20 19:12:52 +01005526 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005527}
5528
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529/* --- UTF-16 Codec ------------------------------------------------------- */
5530
Tim Peters772747b2001-08-09 22:21:55 +00005531PyObject *
5532PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 Py_ssize_t size,
5534 const char *errors,
5535 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536{
Walter Dörwald69652032004-09-07 20:24:22 +00005537 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5538}
5539
5540PyObject *
5541PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 Py_ssize_t size,
5543 const char *errors,
5544 int *byteorder,
5545 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005546{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005548 Py_ssize_t startinpos;
5549 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005550 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005551 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005552 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005553 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005554 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555 PyObject *errorHandler = NULL;
5556 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005557 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
Tim Peters772747b2001-08-09 22:21:55 +00005559 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005560 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
5562 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005563 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005565 /* Check for BOM marks (U+FEFF) in the input and adjust current
5566 byte order setting accordingly. In native mode, the leading BOM
5567 mark is skipped, in all other modes, it is copied to the output
5568 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005569 if (bo == 0 && size >= 2) {
5570 const Py_UCS4 bom = (q[1] << 8) | q[0];
5571 if (bom == 0xFEFF) {
5572 q += 2;
5573 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005575 else if (bom == 0xFFFE) {
5576 q += 2;
5577 bo = 1;
5578 }
5579 if (byteorder)
5580 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
Antoine Pitrou63065d72012-05-15 23:48:04 +02005583 if (q == e) {
5584 if (consumed)
5585 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005586 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005587 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005588
Christian Heimes743e0cd2012-10-17 23:52:17 +02005589#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005591 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005592#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005594 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005595#endif
Tim Peters772747b2001-08-09 22:21:55 +00005596
Antoine Pitrou63065d72012-05-15 23:48:04 +02005597 /* Note: size will always be longer than the resulting Unicode
5598 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005599 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005600 writer.min_length = (e - q + 1) / 2;
5601 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005602 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005603
Antoine Pitrou63065d72012-05-15 23:48:04 +02005604 while (1) {
5605 Py_UCS4 ch = 0;
5606 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005607 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005609 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005610 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005611 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005612 native_ordering);
5613 else
5614 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005615 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005616 native_ordering);
5617 } else if (kind == PyUnicode_2BYTE_KIND) {
5618 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005619 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005620 native_ordering);
5621 } else {
5622 assert(kind == PyUnicode_4BYTE_KIND);
5623 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005626 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005627 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628
Antoine Pitrou63065d72012-05-15 23:48:04 +02005629 switch (ch)
5630 {
5631 case 0:
5632 /* remaining byte at the end? (size should be even) */
5633 if (q == e || consumed)
5634 goto End;
5635 errmsg = "truncated data";
5636 startinpos = ((const char *)q) - starts;
5637 endinpos = ((const char *)e) - starts;
5638 break;
5639 /* The remaining input chars are ignored if the callback
5640 chooses to skip the input */
5641 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005642 q -= 2;
5643 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005644 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005645 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005646 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005647 endinpos = ((const char *)e) - starts;
5648 break;
5649 case 2:
5650 errmsg = "illegal encoding";
5651 startinpos = ((const char *)q) - 2 - starts;
5652 endinpos = startinpos + 2;
5653 break;
5654 case 3:
5655 errmsg = "illegal UTF-16 surrogate";
5656 startinpos = ((const char *)q) - 4 - starts;
5657 endinpos = startinpos + 2;
5658 break;
5659 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005660 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005661 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 continue;
5663 }
5664
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005665 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005666 errors,
5667 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005668 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005669 &starts,
5670 (const char **)&e,
5671 &startinpos,
5672 &endinpos,
5673 &exc,
5674 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005675 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 }
5678
Antoine Pitrou63065d72012-05-15 23:48:04 +02005679End:
Walter Dörwald69652032004-09-07 20:24:22 +00005680 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005682
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 Py_XDECREF(errorHandler);
5684 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005685 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 Py_XDECREF(errorHandler);
5690 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 return NULL;
5692}
5693
Tim Peters772747b2001-08-09 22:21:55 +00005694PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005695_PyUnicode_EncodeUTF16(PyObject *str,
5696 const char *errors,
5697 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005699 enum PyUnicode_Kind kind;
5700 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005701 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005702 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005703 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005704 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005705#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005706 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005707#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005708 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005709#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005710 const char *encoding;
5711 Py_ssize_t nsize, pos;
5712 PyObject *errorHandler = NULL;
5713 PyObject *exc = NULL;
5714 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005715
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005716 if (!PyUnicode_Check(str)) {
5717 PyErr_BadArgument();
5718 return NULL;
5719 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005720 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005721 return NULL;
5722 kind = PyUnicode_KIND(str);
5723 data = PyUnicode_DATA(str);
5724 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005725
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005726 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005727 if (kind == PyUnicode_4BYTE_KIND) {
5728 const Py_UCS4 *in = (const Py_UCS4 *)data;
5729 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005730 while (in < end) {
5731 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005732 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005733 }
5734 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005735 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005736 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005738 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005739 nsize = len + pairs + (byteorder == 0);
5740 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005741 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005745 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005746 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005747 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005748 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005750 }
5751 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005752 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 }
Tim Peters772747b2001-08-09 22:21:55 +00005754
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005755 if (kind == PyUnicode_1BYTE_KIND) {
5756 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5757 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005758 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005759
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005760 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 }
5763 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005764 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
5766 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005768 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769
5770 pos = 0;
5771 while (pos < len) {
5772 Py_ssize_t repsize, moreunits;
5773
5774 if (kind == PyUnicode_2BYTE_KIND) {
5775 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5776 &out, native_ordering);
5777 }
5778 else {
5779 assert(kind == PyUnicode_4BYTE_KIND);
5780 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5781 &out, native_ordering);
5782 }
5783 if (pos == len)
5784 break;
5785
5786 rep = unicode_encode_call_errorhandler(
5787 errors, &errorHandler,
5788 encoding, "surrogates not allowed",
5789 str, &exc, pos, pos + 1, &pos);
5790 if (!rep)
5791 goto error;
5792
5793 if (PyBytes_Check(rep)) {
5794 repsize = PyBytes_GET_SIZE(rep);
5795 if (repsize & 1) {
5796 raise_encode_exception(&exc, encoding,
5797 str, pos - 1, pos,
5798 "surrogates not allowed");
5799 goto error;
5800 }
5801 moreunits = repsize / 2;
5802 }
5803 else {
5804 assert(PyUnicode_Check(rep));
5805 if (PyUnicode_READY(rep) < 0)
5806 goto error;
5807 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5808 if (!PyUnicode_IS_ASCII(rep)) {
5809 raise_encode_exception(&exc, encoding,
5810 str, pos - 1, pos,
5811 "surrogates not allowed");
5812 goto error;
5813 }
5814 }
5815
5816 /* two bytes are reserved for each surrogate */
5817 if (moreunits > 1) {
5818 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5819 Py_ssize_t morebytes = 2 * (moreunits - 1);
5820 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5821 /* integer overflow */
5822 PyErr_NoMemory();
5823 goto error;
5824 }
5825 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5826 goto error;
5827 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5828 }
5829
5830 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005831 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005832 out += moreunits;
5833 } else /* rep is unicode */ {
5834 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5835 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5836 &out, native_ordering);
5837 }
5838
5839 Py_CLEAR(rep);
5840 }
5841
5842 /* Cut back to size actually needed. This is necessary for, for example,
5843 encoding of a string containing isolated surrogates and the 'ignore' handler
5844 is used. */
5845 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5846 if (nsize != PyBytes_GET_SIZE(v))
5847 _PyBytes_Resize(&v, nsize);
5848 Py_XDECREF(errorHandler);
5849 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005850 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005851 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005852 error:
5853 Py_XDECREF(rep);
5854 Py_XDECREF(errorHandler);
5855 Py_XDECREF(exc);
5856 Py_XDECREF(v);
5857 return NULL;
5858#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859}
5860
Alexander Belopolsky40018472011-02-26 01:02:56 +00005861PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005862PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5863 Py_ssize_t size,
5864 const char *errors,
5865 int byteorder)
5866{
5867 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005868 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005869 if (tmp == NULL)
5870 return NULL;
5871 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5872 Py_DECREF(tmp);
5873 return result;
5874}
5875
5876PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005877PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880}
5881
5882/* --- Unicode Escape Codec ----------------------------------------------- */
5883
/* Character-name lookup API used for \N{name} escapes.  Starts out NULL and
   is filled in via PyCapsule_Import() the first time the unicode-escape
   decoder meets a \N escape. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005885
Alexander Belopolsky40018472011-02-26 01:02:56 +00005886PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005887_PyUnicode_DecodeUnicodeEscape(const char *s,
5888 Py_ssize_t size,
5889 const char *errors,
5890 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005892 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005893 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 PyObject *errorHandler = NULL;
5896 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005897
Eric V. Smith42454af2016-10-31 09:22:08 -04005898 // so we can remember if we've seen an invalid escape char or not
5899 *first_invalid_escape = NULL;
5900
Victor Stinner62ec3312016-09-06 17:04:34 -07005901 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005902 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005903 }
5904 /* Escaped strings will always be longer than the resulting
5905 Unicode string, so we start with size here and then reduce the
5906 length after conversion to the true value.
5907 (but if the error callback returns a long replacement string
5908 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005909 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005910 writer.min_length = size;
5911 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5912 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005913 }
5914
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 end = s + size;
5916 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005917 unsigned char c = (unsigned char) *s++;
5918 Py_UCS4 ch;
5919 int count;
5920 Py_ssize_t startinpos;
5921 Py_ssize_t endinpos;
5922 const char *message;
5923
5924#define WRITE_ASCII_CHAR(ch) \
5925 do { \
5926 assert(ch <= 127); \
5927 assert(writer.pos < writer.size); \
5928 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5929 } while(0)
5930
5931#define WRITE_CHAR(ch) \
5932 do { \
5933 if (ch <= writer.maxchar) { \
5934 assert(writer.pos < writer.size); \
5935 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5936 } \
5937 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5938 goto onError; \
5939 } \
5940 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941
5942 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005943 if (c != '\\') {
5944 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 continue;
5946 }
5947
Victor Stinner62ec3312016-09-06 17:04:34 -07005948 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005950 if (s >= end) {
5951 message = "\\ at end of string";
5952 goto error;
5953 }
5954 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005955
Victor Stinner62ec3312016-09-06 17:04:34 -07005956 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005957 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005960 case '\n': continue;
5961 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5962 case '\'': WRITE_ASCII_CHAR('\''); continue;
5963 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5964 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005965 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005966 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5967 case 't': WRITE_ASCII_CHAR('\t'); continue;
5968 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5969 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005970 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005971 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005972 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 case '0': case '1': case '2': case '3':
5977 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005978 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005979 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005980 ch = (ch<<3) + *s++ - '0';
5981 if (s < end && '0' <= *s && *s <= '7') {
5982 ch = (ch<<3) + *s++ - '0';
5983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005985 WRITE_CHAR(ch);
5986 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 /* hex escapes */
5989 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005992 message = "truncated \\xXX escape";
5993 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07005997 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005998 message = "truncated \\uXXXX escape";
5999 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006002 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006003 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006004 message = "truncated \\UXXXXXXXX escape";
6005 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006007 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006008 ch <<= 4;
6009 if (c >= '0' && c <= '9') {
6010 ch += c - '0';
6011 }
6012 else if (c >= 'a' && c <= 'f') {
6013 ch += c - ('a' - 10);
6014 }
6015 else if (c >= 'A' && c <= 'F') {
6016 ch += c - ('A' - 10);
6017 }
6018 else {
6019 break;
6020 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006021 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006022 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006023 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006024 }
6025
6026 /* when we get here, ch is a 32-bit unicode character */
6027 if (ch > MAX_UNICODE) {
6028 message = "illegal Unicode character";
6029 goto error;
6030 }
6031
6032 WRITE_CHAR(ch);
6033 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006034
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006036 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006037 if (ucnhash_CAPI == NULL) {
6038 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006039 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6040 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006041 if (ucnhash_CAPI == NULL) {
6042 PyErr_SetString(
6043 PyExc_UnicodeError,
6044 "\\N escapes not supported (can't load unicodedata module)"
6045 );
6046 goto onError;
6047 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006048 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006049
6050 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006051 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006052 const char *start = ++s;
6053 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006055 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006056 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006057 namelen = s - start;
6058 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006061 ch = 0xffffffff; /* in case 'getcode' messes up */
6062 if (namelen <= INT_MAX &&
6063 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6064 &ch, 0)) {
6065 assert(ch <= MAX_UNICODE);
6066 WRITE_CHAR(ch);
6067 continue;
6068 }
6069 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006070 }
6071 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006072 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073
6074 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006075 if (*first_invalid_escape == NULL) {
6076 *first_invalid_escape = s-1; /* Back up one char, since we've
6077 already incremented s. */
6078 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006079 WRITE_ASCII_CHAR('\\');
6080 WRITE_CHAR(c);
6081 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006083
6084 error:
6085 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006086 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006087 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006088 errors, &errorHandler,
6089 "unicodeescape", message,
6090 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006091 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006092 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006093 }
6094 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6095 goto onError;
6096 }
6097
6098#undef WRITE_ASCII_CHAR
6099#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006101
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006102 Py_XDECREF(errorHandler);
6103 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006104 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006105
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006107 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 Py_XDECREF(errorHandler);
6109 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 return NULL;
6111}
6112
Eric V. Smith42454af2016-10-31 09:22:08 -04006113PyObject *
6114PyUnicode_DecodeUnicodeEscape(const char *s,
6115 Py_ssize_t size,
6116 const char *errors)
6117{
6118 const char *first_invalid_escape;
6119 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6120 &first_invalid_escape);
6121 if (result == NULL)
6122 return NULL;
6123 if (first_invalid_escape != NULL) {
6124 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6125 "invalid escape sequence '\\%c'",
6126 *first_invalid_escape) < 0) {
6127 Py_DECREF(result);
6128 return NULL;
6129 }
6130 }
6131 return result;
6132}
6133
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006134/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Alexander Belopolsky40018472011-02-26 01:02:56 +00006136PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006140 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006143 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006144 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
Ezio Melottie7f90372012-10-05 03:33:31 +03006146 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006147 escape.
6148
Ezio Melottie7f90372012-10-05 03:33:31 +03006149 For UCS1 strings it's '\xxx', 4 bytes per source character.
6150 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6151 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006152 */
6153
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154 if (!PyUnicode_Check(unicode)) {
6155 PyErr_BadArgument();
6156 return NULL;
6157 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006158 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 }
Victor Stinner358af132015-10-12 22:36:57 +02006161
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 if (len == 0) {
6164 return PyBytes_FromStringAndSize(NULL, 0);
6165 }
6166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 kind = PyUnicode_KIND(unicode);
6168 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006169 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6170 bytes, and 1 byte characters 4. */
6171 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006172 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 return PyErr_NoMemory();
6174 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006175 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 if (repr == NULL) {
6177 return NULL;
6178 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179
Victor Stinner62ec3312016-09-06 17:04:34 -07006180 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006182 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006183
Victor Stinner62ec3312016-09-06 17:04:34 -07006184 /* U+0000-U+00ff range */
6185 if (ch < 0x100) {
6186 if (ch >= ' ' && ch < 127) {
6187 if (ch != '\\') {
6188 /* Copy printable US ASCII as-is */
6189 *p++ = (char) ch;
6190 }
6191 /* Escape backslashes */
6192 else {
6193 *p++ = '\\';
6194 *p++ = '\\';
6195 }
6196 }
Victor Stinner358af132015-10-12 22:36:57 +02006197
Victor Stinner62ec3312016-09-06 17:04:34 -07006198 /* Map special whitespace to '\t', \n', '\r' */
6199 else if (ch == '\t') {
6200 *p++ = '\\';
6201 *p++ = 't';
6202 }
6203 else if (ch == '\n') {
6204 *p++ = '\\';
6205 *p++ = 'n';
6206 }
6207 else if (ch == '\r') {
6208 *p++ = '\\';
6209 *p++ = 'r';
6210 }
6211
6212 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6213 else {
6214 *p++ = '\\';
6215 *p++ = 'x';
6216 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6217 *p++ = Py_hexdigits[ch & 0x000F];
6218 }
Tim Petersced69f82003-09-16 20:30:58 +00006219 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006220 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 *p++ = '\\';
6223 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006224 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6225 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6226 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6227 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006229 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6230 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006231
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 /* Make sure that the first two digits are zero */
6233 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006234 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 *p++ = 'U';
6236 *p++ = '0';
6237 *p++ = '0';
6238 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6239 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6240 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6241 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6242 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6243 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 assert(p - PyBytes_AS_STRING(repr) > 0);
6248 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6249 return NULL;
6250 }
6251 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252}
6253
Alexander Belopolsky40018472011-02-26 01:02:56 +00006254PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006255PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6256 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006258 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006259 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006262 }
6263
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006264 result = PyUnicode_AsUnicodeEscapeString(tmp);
6265 Py_DECREF(tmp);
6266 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267}
6268
6269/* --- Raw Unicode Escape Codec ------------------------------------------- */
6270
Alexander Belopolsky40018472011-02-26 01:02:56 +00006271PyObject *
6272PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006273 Py_ssize_t size,
6274 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006276 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006277 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 PyObject *errorHandler = NULL;
6280 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006281
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006283 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006284 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006285
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 /* Escaped strings will always be longer than the resulting
6287 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006288 length after conversion to the true value. (But decoding error
6289 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006290 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006291 writer.min_length = size;
6292 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6293 goto onError;
6294 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006295
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 end = s + size;
6297 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 unsigned char c = (unsigned char) *s++;
6299 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006300 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 Py_ssize_t startinpos;
6302 Py_ssize_t endinpos;
6303 const char *message;
6304
6305#define WRITE_CHAR(ch) \
6306 do { \
6307 if (ch <= writer.maxchar) { \
6308 assert(writer.pos < writer.size); \
6309 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6310 } \
6311 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6312 goto onError; \
6313 } \
6314 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 if (c != '\\' || s >= end) {
6318 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006320 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006321
Victor Stinner62ec3312016-09-06 17:04:34 -07006322 c = (unsigned char) *s++;
6323 if (c == 'u') {
6324 count = 4;
6325 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006327 else if (c == 'U') {
6328 count = 8;
6329 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006330 }
6331 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 assert(writer.pos < writer.size);
6333 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6334 WRITE_CHAR(c);
6335 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006336 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006337 startinpos = s - starts - 2;
6338
6339 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6340 for (ch = 0; count && s < end; ++s, --count) {
6341 c = (unsigned char)*s;
6342 ch <<= 4;
6343 if (c >= '0' && c <= '9') {
6344 ch += c - '0';
6345 }
6346 else if (c >= 'a' && c <= 'f') {
6347 ch += c - ('a' - 10);
6348 }
6349 else if (c >= 'A' && c <= 'F') {
6350 ch += c - ('A' - 10);
6351 }
6352 else {
6353 break;
6354 }
6355 }
6356 if (!count) {
6357 if (ch <= MAX_UNICODE) {
6358 WRITE_CHAR(ch);
6359 continue;
6360 }
6361 message = "\\Uxxxxxxxx out of range";
6362 }
6363
6364 endinpos = s-starts;
6365 writer.min_length = end - s + writer.pos;
6366 if (unicode_decode_call_errorhandler_writer(
6367 errors, &errorHandler,
6368 "rawunicodeescape", message,
6369 &starts, &end, &startinpos, &endinpos, &exc, &s,
6370 &writer)) {
6371 goto onError;
6372 }
6373 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6374 goto onError;
6375 }
6376
6377#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379 Py_XDECREF(errorHandler);
6380 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006381 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006382
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006384 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 Py_XDECREF(errorHandler);
6386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006388
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389}
6390
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006391
Alexander Belopolsky40018472011-02-26 01:02:56 +00006392PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006393PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394{
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006397 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006398 int kind;
6399 void *data;
6400 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006402 if (!PyUnicode_Check(unicode)) {
6403 PyErr_BadArgument();
6404 return NULL;
6405 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006409 kind = PyUnicode_KIND(unicode);
6410 data = PyUnicode_DATA(unicode);
6411 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 if (kind == PyUnicode_1BYTE_KIND) {
6413 return PyBytes_FromStringAndSize(data, len);
6414 }
Victor Stinner0e368262011-11-10 20:12:49 +01006415
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6417 bytes, and 1 byte characters 4. */
6418 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006419
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 if (len > PY_SSIZE_T_MAX / expandsize) {
6421 return PyErr_NoMemory();
6422 }
6423 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6424 if (repr == NULL) {
6425 return NULL;
6426 }
6427 if (len == 0) {
6428 return repr;
6429 }
6430
6431 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006432 for (pos = 0; pos < len; pos++) {
6433 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006434
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6436 if (ch < 0x100) {
6437 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006438 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006439 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6440 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 *p++ = '\\';
6442 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006443 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6444 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6445 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6446 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6449 else {
6450 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6451 *p++ = '\\';
6452 *p++ = 'U';
6453 *p++ = '0';
6454 *p++ = '0';
6455 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6456 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6457 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6458 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6459 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6460 *p++ = Py_hexdigits[ch & 15];
6461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006463
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 assert(p > PyBytes_AS_STRING(repr));
6465 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6466 return NULL;
6467 }
6468 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469}
6470
Alexander Belopolsky40018472011-02-26 01:02:56 +00006471PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006472PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006475 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006476 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006477 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006478 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006479 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6480 Py_DECREF(tmp);
6481 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482}
6483
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006484/* --- Unicode Internal Codec ------------------------------------------- */
6485
Alexander Belopolsky40018472011-02-26 01:02:56 +00006486PyObject *
6487_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006488 Py_ssize_t size,
6489 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006490{
6491 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006492 Py_ssize_t startinpos;
6493 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006494 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006495 const char *end;
6496 const char *reason;
6497 PyObject *errorHandler = NULL;
6498 PyObject *exc = NULL;
6499
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006500 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006501 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006502 1))
6503 return NULL;
6504
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006505 if (size == 0)
6506 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006507
Victor Stinner8f674cc2013-04-17 23:02:17 +02006508 _PyUnicodeWriter_Init(&writer);
6509 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6510 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006512 }
6513 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006514
Victor Stinner8f674cc2013-04-17 23:02:17 +02006515 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006516 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006517 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006518 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006519 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006520 endinpos = end-starts;
6521 reason = "truncated input";
6522 goto error;
6523 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006524 /* We copy the raw representation one byte at a time because the
6525 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006526 ((char *) &uch)[0] = s[0];
6527 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006528#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006529 ((char *) &uch)[2] = s[2];
6530 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006531#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006532 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006533#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006534 /* We have to sanity check the raw data, otherwise doom looms for
6535 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006536 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006537 endinpos = s - starts + Py_UNICODE_SIZE;
6538 reason = "illegal code point (> 0x10FFFF)";
6539 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006540 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006541#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006542 s += Py_UNICODE_SIZE;
6543#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006544 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006545 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006546 Py_UNICODE uch2;
6547 ((char *) &uch2)[0] = s[0];
6548 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006549 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006550 {
Victor Stinner551ac952011-11-29 22:58:13 +01006551 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006552 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006553 }
6554 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006555#endif
6556
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006557 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006558 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006559 continue;
6560
6561 error:
6562 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006563 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006564 errors, &errorHandler,
6565 "unicode_internal", reason,
6566 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006567 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006568 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006569 }
6570
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006571 Py_XDECREF(errorHandler);
6572 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006573 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006574
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006576 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006577 Py_XDECREF(errorHandler);
6578 Py_XDECREF(exc);
6579 return NULL;
6580}
6581
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582/* --- Latin-1 Codec ------------------------------------------------------ */
6583
Alexander Belopolsky40018472011-02-26 01:02:56 +00006584PyObject *
6585PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006586 Py_ssize_t size,
6587 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006590 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591}
6592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006594static void
6595make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006596 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006597 PyObject *unicode,
6598 Py_ssize_t startpos, Py_ssize_t endpos,
6599 const char *reason)
6600{
6601 if (*exceptionObject == NULL) {
6602 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006603 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006604 encoding, unicode, startpos, endpos, reason);
6605 }
6606 else {
6607 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6608 goto onError;
6609 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6610 goto onError;
6611 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6612 goto onError;
6613 return;
6614 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006615 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006616 }
6617}
6618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006619/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006620static void
6621raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006622 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006623 PyObject *unicode,
6624 Py_ssize_t startpos, Py_ssize_t endpos,
6625 const char *reason)
6626{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006627 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006628 encoding, unicode, startpos, endpos, reason);
6629 if (*exceptionObject != NULL)
6630 PyCodec_StrictErrors(*exceptionObject);
6631}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632
6633/* error handling callback helper:
6634 build arguments, call the callback and check the arguments,
6635 put the result into newpos and return the replacement string, which
6636 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006637static PyObject *
6638unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006639 PyObject **errorHandler,
6640 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006641 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006642 Py_ssize_t startpos, Py_ssize_t endpos,
6643 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006645 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647 PyObject *restuple;
6648 PyObject *resunicode;
6649
6650 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006652 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654 }
6655
Benjamin Petersonbac79492012-01-14 13:34:47 -05006656 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 return NULL;
6658 len = PyUnicode_GET_LENGTH(unicode);
6659
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006660 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006665 restuple = PyObject_CallFunctionObjArgs(
6666 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006670 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 Py_DECREF(restuple);
6672 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006674 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 &resunicode, newpos)) {
6676 Py_DECREF(restuple);
6677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006678 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006679 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6680 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6681 Py_DECREF(restuple);
6682 return NULL;
6683 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006685 *newpos = len + *newpos;
6686 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006687 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 Py_DECREF(restuple);
6689 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006690 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 Py_INCREF(resunicode);
6692 Py_DECREF(restuple);
6693 return resunicode;
6694}
6695
Alexander Belopolsky40018472011-02-26 01:02:56 +00006696static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006697unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006698 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006699 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006701 /* input state */
6702 Py_ssize_t pos=0, size;
6703 int kind;
6704 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 /* pointer into the output */
6706 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006707 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6708 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006709 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006711 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006712 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006713 /* output object */
6714 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715
Benjamin Petersonbac79492012-01-14 13:34:47 -05006716 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 return NULL;
6718 size = PyUnicode_GET_LENGTH(unicode);
6719 kind = PyUnicode_KIND(unicode);
6720 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 /* allocate enough for a simple encoding without
6722 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006723 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006724 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006725
6726 _PyBytesWriter_Init(&writer);
6727 str = _PyBytesWriter_Alloc(&writer, size);
6728 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006729 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006732 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006735 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006737 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006739 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006741 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006743 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006744 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006746
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006747 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006749
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006750 /* Only overallocate the buffer if it's not the last write */
6751 writer.overallocate = (collend < size);
6752
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006754 if (error_handler == _Py_ERROR_UNKNOWN)
6755 error_handler = get_error_handler(errors);
6756
6757 switch (error_handler) {
6758 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006759 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006761
6762 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006763 memset(str, '?', collend - collstart);
6764 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006765 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006766 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006767 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 break;
Victor Stinner50149202015-09-22 00:26:54 +02006769
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006770 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006771 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006772 writer.min_size -= (collend - collstart);
6773 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006774 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006775 if (str == NULL)
6776 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006777 pos = collend;
6778 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006779
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006780 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006781 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006782 writer.min_size -= (collend - collstart);
6783 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006784 unicode, collstart, collend);
6785 if (str == NULL)
6786 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006787 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 break;
Victor Stinner50149202015-09-22 00:26:54 +02006789
Victor Stinnerc3713e92015-09-29 12:32:13 +02006790 case _Py_ERROR_SURROGATEESCAPE:
6791 for (i = collstart; i < collend; ++i) {
6792 ch = PyUnicode_READ(kind, data, i);
6793 if (ch < 0xdc80 || 0xdcff < ch) {
6794 /* Not a UTF-8b surrogate */
6795 break;
6796 }
6797 *str++ = (char)(ch - 0xdc00);
6798 ++pos;
6799 }
6800 if (i >= collend)
6801 break;
6802 collstart = pos;
6803 assert(collstart != collend);
6804 /* fallback to general error handling */
6805
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006807 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6808 encoding, reason, unicode, &exc,
6809 collstart, collend, &newpos);
6810 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006812
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006813 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006814 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006815
Victor Stinner6bd525b2015-10-09 13:10:05 +02006816 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006817 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006818 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006819 PyBytes_AS_STRING(rep),
6820 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006821 if (str == NULL)
6822 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006823 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006824 else {
6825 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006826
Victor Stinner6bd525b2015-10-09 13:10:05 +02006827 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006829
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006830 if (limit == 256 ?
6831 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6832 !PyUnicode_IS_ASCII(rep))
6833 {
6834 /* Not all characters are smaller than limit */
6835 raise_encode_exception(&exc, encoding, unicode,
6836 collstart, collend, reason);
6837 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006839 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6840 str = _PyBytesWriter_WriteBytes(&writer, str,
6841 PyUnicode_DATA(rep),
6842 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006844 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006846 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006847
6848 /* If overallocation was disabled, ensure that it was the last
6849 write. Otherwise, we missed an optimization */
6850 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006851 }
6852 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006853
Victor Stinner50149202015-09-22 00:26:54 +02006854 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006855 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006856 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006857
6858 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006859 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006860 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006861 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006862 Py_XDECREF(exc);
6863 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006864}
6865
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006866/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006867PyObject *
6868PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006869 Py_ssize_t size,
6870 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006872 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006873 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006874 if (unicode == NULL)
6875 return NULL;
6876 result = unicode_encode_ucs1(unicode, errors, 256);
6877 Py_DECREF(unicode);
6878 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879}
6880
Alexander Belopolsky40018472011-02-26 01:02:56 +00006881PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006882_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883{
6884 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 PyErr_BadArgument();
6886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006888 if (PyUnicode_READY(unicode) == -1)
6889 return NULL;
6890 /* Fast path: if it is a one-byte string, construct
6891 bytes object directly. */
6892 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6893 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6894 PyUnicode_GET_LENGTH(unicode));
6895 /* Non-Latin-1 characters present. Defer to above function to
6896 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006897 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006898}
6899
6900PyObject*
6901PyUnicode_AsLatin1String(PyObject *unicode)
6902{
6903 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904}
6905
6906/* --- 7-bit ASCII Codec -------------------------------------------------- */
6907
Alexander Belopolsky40018472011-02-26 01:02:56 +00006908PyObject *
6909PyUnicode_DecodeASCII(const char *s,
6910 Py_ssize_t size,
6911 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006914 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006915 int kind;
6916 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006917 Py_ssize_t startinpos;
6918 Py_ssize_t endinpos;
6919 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006921 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006923 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006924
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006926 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006927
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006929 if (size == 1 && (unsigned char)s[0] < 128)
6930 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006931
Victor Stinner8f674cc2013-04-17 23:02:17 +02006932 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006933 writer.min_length = size;
6934 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006935 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006936
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006938 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006939 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006940 writer.pos = outpos;
6941 if (writer.pos == size)
6942 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006943
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006944 s += writer.pos;
6945 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006947 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006949 PyUnicode_WRITE(kind, data, writer.pos, c);
6950 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006952 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006954
6955 /* byte outsize range 0x00..0x7f: call the error handler */
6956
6957 if (error_handler == _Py_ERROR_UNKNOWN)
6958 error_handler = get_error_handler(errors);
6959
6960 switch (error_handler)
6961 {
6962 case _Py_ERROR_REPLACE:
6963 case _Py_ERROR_SURROGATEESCAPE:
6964 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006965 but we may switch to UCS2 at the first write */
6966 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6967 goto onError;
6968 kind = writer.kind;
6969 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006970
6971 if (error_handler == _Py_ERROR_REPLACE)
6972 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6973 else
6974 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6975 writer.pos++;
6976 ++s;
6977 break;
6978
6979 case _Py_ERROR_IGNORE:
6980 ++s;
6981 break;
6982
6983 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 startinpos = s-starts;
6985 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006986 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006987 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 "ascii", "ordinal not in range(128)",
6989 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006990 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006992 kind = writer.kind;
6993 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006996 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006997 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006998 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006999
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007001 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007002 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007003 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 return NULL;
7005}
7006
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007007/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007008PyObject *
7009PyUnicode_EncodeASCII(const Py_UNICODE *p,
7010 Py_ssize_t size,
7011 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007013 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007014 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007015 if (unicode == NULL)
7016 return NULL;
7017 result = unicode_encode_ucs1(unicode, errors, 128);
7018 Py_DECREF(unicode);
7019 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020}
7021
Alexander Belopolsky40018472011-02-26 01:02:56 +00007022PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007023_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024{
7025 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 PyErr_BadArgument();
7027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007029 if (PyUnicode_READY(unicode) == -1)
7030 return NULL;
7031 /* Fast path: if it is an ASCII-only string, construct bytes object
7032 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007033 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007034 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7035 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007036 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007037}
7038
7039PyObject *
7040PyUnicode_AsASCIIString(PyObject *unicode)
7041{
7042 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043}
7044
Steve Dowercc16be82016-09-08 10:35:16 -07007045#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007046
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007047/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007048
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007049#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050#define NEED_RETRY
7051#endif
7052
Victor Stinner3a50e702011-10-18 21:21:00 +02007053#ifndef WC_ERR_INVALID_CHARS
7054# define WC_ERR_INVALID_CHARS 0x0080
7055#endif
7056
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007057static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007058code_page_name(UINT code_page, PyObject **obj)
7059{
7060 *obj = NULL;
7061 if (code_page == CP_ACP)
7062 return "mbcs";
7063 if (code_page == CP_UTF7)
7064 return "CP_UTF7";
7065 if (code_page == CP_UTF8)
7066 return "CP_UTF8";
7067
7068 *obj = PyBytes_FromFormat("cp%u", code_page);
7069 if (*obj == NULL)
7070 return NULL;
7071 return PyBytes_AS_STRING(*obj);
7072}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073
Victor Stinner3a50e702011-10-18 21:21:00 +02007074static DWORD
7075decode_code_page_flags(UINT code_page)
7076{
7077 if (code_page == CP_UTF7) {
7078 /* The CP_UTF7 decoder only supports flags=0 */
7079 return 0;
7080 }
7081 else
7082 return MB_ERR_INVALID_CHARS;
7083}
7084
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 * Decode a byte string from a Windows code page into unicode object in strict
7087 * mode.
7088 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007089 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7090 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007092static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007093decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007094 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007095 const char *in,
7096 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097{
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007099 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007101
7102 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 assert(insize > 0);
7104 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7105 if (outsize <= 0)
7106 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107
7108 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007110 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007111 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 if (*v == NULL)
7113 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115 }
7116 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007117 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007119 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122 }
7123
7124 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7126 if (outsize <= 0)
7127 goto error;
7128 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007129
Victor Stinner3a50e702011-10-18 21:21:00 +02007130error:
7131 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7132 return -2;
7133 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007134 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007135}
7136
Victor Stinner3a50e702011-10-18 21:21:00 +02007137/*
7138 * Decode a byte string from a code page into unicode object with an error
7139 * handler.
7140 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007141 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 * UnicodeDecodeError exception and returns -1 on error.
7143 */
7144static int
7145decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007146 PyObject **v,
7147 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007148 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007149{
7150 const char *startin = in;
7151 const char *endin = in + size;
7152 const DWORD flags = decode_code_page_flags(code_page);
7153 /* Ideally, we should get reason from FormatMessage. This is the Windows
7154 2000 English version of the message. */
7155 const char *reason = "No mapping for the Unicode character exists "
7156 "in the target code page.";
7157 /* each step cannot decode more than 1 character, but a character can be
7158 represented as a surrogate pair */
7159 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007160 int insize;
7161 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 PyObject *errorHandler = NULL;
7163 PyObject *exc = NULL;
7164 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007165 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 DWORD err;
7167 int ret = -1;
7168
7169 assert(size > 0);
7170
7171 encoding = code_page_name(code_page, &encoding_obj);
7172 if (encoding == NULL)
7173 return -1;
7174
Victor Stinner7d00cc12014-03-17 23:08:06 +01007175 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7177 UnicodeDecodeError. */
7178 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7179 if (exc != NULL) {
7180 PyCodec_StrictErrors(exc);
7181 Py_CLEAR(exc);
7182 }
7183 goto error;
7184 }
7185
7186 if (*v == NULL) {
7187 /* Create unicode object */
7188 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7189 PyErr_NoMemory();
7190 goto error;
7191 }
Victor Stinnerab595942011-12-17 04:59:06 +01007192 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007193 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 if (*v == NULL)
7195 goto error;
7196 startout = PyUnicode_AS_UNICODE(*v);
7197 }
7198 else {
7199 /* Extend unicode object */
7200 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7201 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7202 PyErr_NoMemory();
7203 goto error;
7204 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007205 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 goto error;
7207 startout = PyUnicode_AS_UNICODE(*v) + n;
7208 }
7209
7210 /* Decode the byte string character per character */
7211 out = startout;
7212 while (in < endin)
7213 {
7214 /* Decode a character */
7215 insize = 1;
7216 do
7217 {
7218 outsize = MultiByteToWideChar(code_page, flags,
7219 in, insize,
7220 buffer, Py_ARRAY_LENGTH(buffer));
7221 if (outsize > 0)
7222 break;
7223 err = GetLastError();
7224 if (err != ERROR_NO_UNICODE_TRANSLATION
7225 && err != ERROR_INSUFFICIENT_BUFFER)
7226 {
7227 PyErr_SetFromWindowsErr(0);
7228 goto error;
7229 }
7230 insize++;
7231 }
7232 /* 4=maximum length of a UTF-8 sequence */
7233 while (insize <= 4 && (in + insize) <= endin);
7234
7235 if (outsize <= 0) {
7236 Py_ssize_t startinpos, endinpos, outpos;
7237
Victor Stinner7d00cc12014-03-17 23:08:06 +01007238 /* last character in partial decode? */
7239 if (in + insize >= endin && !final)
7240 break;
7241
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 startinpos = in - startin;
7243 endinpos = startinpos + 1;
7244 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007245 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 errors, &errorHandler,
7247 encoding, reason,
7248 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007249 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 {
7251 goto error;
7252 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007253 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 }
7255 else {
7256 in += insize;
7257 memcpy(out, buffer, outsize * sizeof(wchar_t));
7258 out += outsize;
7259 }
7260 }
7261
7262 /* write a NUL character at the end */
7263 *out = 0;
7264
7265 /* Extend unicode object */
7266 outsize = out - startout;
7267 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007268 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007270 /* (in - startin) <= size and size is an int */
7271 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007272
7273error:
7274 Py_XDECREF(encoding_obj);
7275 Py_XDECREF(errorHandler);
7276 Py_XDECREF(exc);
7277 return ret;
7278}
7279
Victor Stinner3a50e702011-10-18 21:21:00 +02007280static PyObject *
7281decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007282 const char *s, Py_ssize_t size,
7283 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007284{
Victor Stinner76a31a62011-11-04 00:05:13 +01007285 PyObject *v = NULL;
7286 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 if (code_page < 0) {
7289 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7290 return NULL;
7291 }
7292
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007295
Victor Stinner76a31a62011-11-04 00:05:13 +01007296 do
7297 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007298#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 if (size > INT_MAX) {
7300 chunk_size = INT_MAX;
7301 final = 0;
7302 done = 0;
7303 }
7304 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007306 {
7307 chunk_size = (int)size;
7308 final = (consumed == NULL);
7309 done = 1;
7310 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311
Victor Stinner76a31a62011-11-04 00:05:13 +01007312 if (chunk_size == 0 && done) {
7313 if (v != NULL)
7314 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007315 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007316 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317
Victor Stinner76a31a62011-11-04 00:05:13 +01007318 converted = decode_code_page_strict(code_page, &v,
7319 s, chunk_size);
7320 if (converted == -2)
7321 converted = decode_code_page_errors(code_page, &v,
7322 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007323 errors, final);
7324 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007325
7326 if (converted < 0) {
7327 Py_XDECREF(v);
7328 return NULL;
7329 }
7330
7331 if (consumed)
7332 *consumed += converted;
7333
7334 s += converted;
7335 size -= converted;
7336 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007337
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007338 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007339}
7340
Alexander Belopolsky40018472011-02-26 01:02:56 +00007341PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007342PyUnicode_DecodeCodePageStateful(int code_page,
7343 const char *s,
7344 Py_ssize_t size,
7345 const char *errors,
7346 Py_ssize_t *consumed)
7347{
7348 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7349}
7350
7351PyObject *
7352PyUnicode_DecodeMBCSStateful(const char *s,
7353 Py_ssize_t size,
7354 const char *errors,
7355 Py_ssize_t *consumed)
7356{
7357 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7358}
7359
7360PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007361PyUnicode_DecodeMBCS(const char *s,
7362 Py_ssize_t size,
7363 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007364{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7366}
7367
Victor Stinner3a50e702011-10-18 21:21:00 +02007368static DWORD
7369encode_code_page_flags(UINT code_page, const char *errors)
7370{
7371 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007372 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007373 }
7374 else if (code_page == CP_UTF7) {
7375 /* CP_UTF7 only supports flags=0 */
7376 return 0;
7377 }
7378 else {
7379 if (errors != NULL && strcmp(errors, "replace") == 0)
7380 return 0;
7381 else
7382 return WC_NO_BEST_FIT_CHARS;
7383 }
7384}
7385
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007386/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 * Encode a Unicode string to a Windows code page into a byte string in strict
7388 * mode.
7389 *
7390 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007391 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007392 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007393static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007394encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007395 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007397{
Victor Stinner554f3f02010-06-16 23:33:54 +00007398 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 BOOL *pusedDefaultChar = &usedDefaultChar;
7400 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007401 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007402 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 const DWORD flags = encode_code_page_flags(code_page, NULL);
7404 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007405 /* Create a substring so that we can get the UTF-16 representation
7406 of just the slice under consideration. */
7407 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007408
Martin v. Löwis3d325192011-11-04 18:23:06 +01007409 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007410
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007412 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007414 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007415
Victor Stinner2fc507f2011-11-04 20:06:39 +01007416 substring = PyUnicode_Substring(unicode, offset, offset+len);
7417 if (substring == NULL)
7418 return -1;
7419 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7420 if (p == NULL) {
7421 Py_DECREF(substring);
7422 return -1;
7423 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007424 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007425
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007426 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007428 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 NULL, 0,
7430 NULL, pusedDefaultChar);
7431 if (outsize <= 0)
7432 goto error;
7433 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007434 if (pusedDefaultChar && *pusedDefaultChar) {
7435 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007438
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007442 if (*outbytes == NULL) {
7443 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007445 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007446 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007447 }
7448 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 const Py_ssize_t n = PyBytes_Size(*outbytes);
7451 if (outsize > PY_SSIZE_T_MAX - n) {
7452 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007453 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7457 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007461 }
7462
7463 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007465 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 out, outsize,
7467 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 if (outsize <= 0)
7470 goto error;
7471 if (pusedDefaultChar && *pusedDefaultChar)
7472 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007474
Victor Stinner3a50e702011-10-18 21:21:00 +02007475error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7478 return -2;
7479 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007480 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007481}
7482
Victor Stinner3a50e702011-10-18 21:21:00 +02007483/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007484 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 * error handler.
7486 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007487 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 * -1 on other error.
7489 */
7490static int
7491encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007492 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007493 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007494{
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007496 Py_ssize_t pos = unicode_offset;
7497 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 /* Ideally, we should get reason from FormatMessage. This is the Windows
7499 2000 English version of the message. */
7500 const char *reason = "invalid character";
7501 /* 4=maximum length of a UTF-8 sequence */
7502 char buffer[4];
7503 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7504 Py_ssize_t outsize;
7505 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 PyObject *errorHandler = NULL;
7507 PyObject *exc = NULL;
7508 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007509 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007510 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 PyObject *rep;
7512 int ret = -1;
7513
7514 assert(insize > 0);
7515
7516 encoding = code_page_name(code_page, &encoding_obj);
7517 if (encoding == NULL)
7518 return -1;
7519
7520 if (errors == NULL || strcmp(errors, "strict") == 0) {
7521 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7522 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007523 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 if (exc != NULL) {
7525 PyCodec_StrictErrors(exc);
7526 Py_DECREF(exc);
7527 }
7528 Py_XDECREF(encoding_obj);
7529 return -1;
7530 }
7531
7532 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7533 pusedDefaultChar = &usedDefaultChar;
7534 else
7535 pusedDefaultChar = NULL;
7536
7537 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7538 PyErr_NoMemory();
7539 goto error;
7540 }
7541 outsize = insize * Py_ARRAY_LENGTH(buffer);
7542
7543 if (*outbytes == NULL) {
7544 /* Create string object */
7545 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7546 if (*outbytes == NULL)
7547 goto error;
7548 out = PyBytes_AS_STRING(*outbytes);
7549 }
7550 else {
7551 /* Extend string object */
7552 Py_ssize_t n = PyBytes_Size(*outbytes);
7553 if (n > PY_SSIZE_T_MAX - outsize) {
7554 PyErr_NoMemory();
7555 goto error;
7556 }
7557 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7558 goto error;
7559 out = PyBytes_AS_STRING(*outbytes) + n;
7560 }
7561
7562 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007563 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007564 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007565 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7566 wchar_t chars[2];
7567 int charsize;
7568 if (ch < 0x10000) {
7569 chars[0] = (wchar_t)ch;
7570 charsize = 1;
7571 }
7572 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007573 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7574 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007575 charsize = 2;
7576 }
7577
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007579 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 buffer, Py_ARRAY_LENGTH(buffer),
7581 NULL, pusedDefaultChar);
7582 if (outsize > 0) {
7583 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7584 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007585 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 memcpy(out, buffer, outsize);
7587 out += outsize;
7588 continue;
7589 }
7590 }
7591 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7592 PyErr_SetFromWindowsErr(0);
7593 goto error;
7594 }
7595
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 rep = unicode_encode_call_errorhandler(
7597 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007598 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007599 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 if (rep == NULL)
7601 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007602 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007603
7604 if (PyBytes_Check(rep)) {
7605 outsize = PyBytes_GET_SIZE(rep);
7606 if (outsize != 1) {
7607 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7608 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7609 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7610 Py_DECREF(rep);
7611 goto error;
7612 }
7613 out = PyBytes_AS_STRING(*outbytes) + offset;
7614 }
7615 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7616 out += outsize;
7617 }
7618 else {
7619 Py_ssize_t i;
7620 enum PyUnicode_Kind kind;
7621 void *data;
7622
Benjamin Petersonbac79492012-01-14 13:34:47 -05007623 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 Py_DECREF(rep);
7625 goto error;
7626 }
7627
7628 outsize = PyUnicode_GET_LENGTH(rep);
7629 if (outsize != 1) {
7630 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7631 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7632 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7633 Py_DECREF(rep);
7634 goto error;
7635 }
7636 out = PyBytes_AS_STRING(*outbytes) + offset;
7637 }
7638 kind = PyUnicode_KIND(rep);
7639 data = PyUnicode_DATA(rep);
7640 for (i=0; i < outsize; i++) {
7641 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7642 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007643 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007644 encoding, unicode,
7645 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 "unable to encode error handler result to ASCII");
7647 Py_DECREF(rep);
7648 goto error;
7649 }
7650 *out = (unsigned char)ch;
7651 out++;
7652 }
7653 }
7654 Py_DECREF(rep);
7655 }
7656 /* write a NUL byte */
7657 *out = 0;
7658 outsize = out - PyBytes_AS_STRING(*outbytes);
7659 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7660 if (_PyBytes_Resize(outbytes, outsize) < 0)
7661 goto error;
7662 ret = 0;
7663
7664error:
7665 Py_XDECREF(encoding_obj);
7666 Py_XDECREF(errorHandler);
7667 Py_XDECREF(exc);
7668 return ret;
7669}
7670
Victor Stinner3a50e702011-10-18 21:21:00 +02007671static PyObject *
7672encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007673 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007674 const char *errors)
7675{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007676 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007677 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007678 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007679 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007680
Victor Stinner29dacf22015-01-26 16:41:32 +01007681 if (!PyUnicode_Check(unicode)) {
7682 PyErr_BadArgument();
7683 return NULL;
7684 }
7685
Benjamin Petersonbac79492012-01-14 13:34:47 -05007686 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007687 return NULL;
7688 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007689
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 if (code_page < 0) {
7691 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7692 return NULL;
7693 }
7694
Martin v. Löwis3d325192011-11-04 18:23:06 +01007695 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007696 return PyBytes_FromStringAndSize(NULL, 0);
7697
Victor Stinner7581cef2011-11-03 22:32:33 +01007698 offset = 0;
7699 do
7700 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007701#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007702 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007703 chunks. */
7704 if (len > INT_MAX/2) {
7705 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007706 done = 0;
7707 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007708 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007709#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007710 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007711 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007712 done = 1;
7713 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007714
Victor Stinner76a31a62011-11-04 00:05:13 +01007715 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007716 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007717 errors);
7718 if (ret == -2)
7719 ret = encode_code_page_errors(code_page, &outbytes,
7720 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007721 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007722 if (ret < 0) {
7723 Py_XDECREF(outbytes);
7724 return NULL;
7725 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007726
Victor Stinner7581cef2011-11-03 22:32:33 +01007727 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007728 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007729 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007730
Victor Stinner3a50e702011-10-18 21:21:00 +02007731 return outbytes;
7732}
7733
7734PyObject *
7735PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7736 Py_ssize_t size,
7737 const char *errors)
7738{
Victor Stinner7581cef2011-11-03 22:32:33 +01007739 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007740 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007741 if (unicode == NULL)
7742 return NULL;
7743 res = encode_code_page(CP_ACP, unicode, errors);
7744 Py_DECREF(unicode);
7745 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007746}
7747
7748PyObject *
7749PyUnicode_EncodeCodePage(int code_page,
7750 PyObject *unicode,
7751 const char *errors)
7752{
Victor Stinner7581cef2011-11-03 22:32:33 +01007753 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007754}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007755
Alexander Belopolsky40018472011-02-26 01:02:56 +00007756PyObject *
7757PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007758{
Victor Stinner7581cef2011-11-03 22:32:33 +01007759 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007760}
7761
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007762#undef NEED_RETRY
7763
Steve Dowercc16be82016-09-08 10:35:16 -07007764#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007765
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766/* --- Character Mapping Codec -------------------------------------------- */
7767
Victor Stinnerfb161b12013-04-18 01:44:27 +02007768static int
7769charmap_decode_string(const char *s,
7770 Py_ssize_t size,
7771 PyObject *mapping,
7772 const char *errors,
7773 _PyUnicodeWriter *writer)
7774{
7775 const char *starts = s;
7776 const char *e;
7777 Py_ssize_t startinpos, endinpos;
7778 PyObject *errorHandler = NULL, *exc = NULL;
7779 Py_ssize_t maplen;
7780 enum PyUnicode_Kind mapkind;
7781 void *mapdata;
7782 Py_UCS4 x;
7783 unsigned char ch;
7784
7785 if (PyUnicode_READY(mapping) == -1)
7786 return -1;
7787
7788 maplen = PyUnicode_GET_LENGTH(mapping);
7789 mapdata = PyUnicode_DATA(mapping);
7790 mapkind = PyUnicode_KIND(mapping);
7791
7792 e = s + size;
7793
7794 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7795 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7796 * is disabled in encoding aliases, latin1 is preferred because
7797 * its implementation is faster. */
7798 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7799 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7800 Py_UCS4 maxchar = writer->maxchar;
7801
7802 assert (writer->kind == PyUnicode_1BYTE_KIND);
7803 while (s < e) {
7804 ch = *s;
7805 x = mapdata_ucs1[ch];
7806 if (x > maxchar) {
7807 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7808 goto onError;
7809 maxchar = writer->maxchar;
7810 outdata = (Py_UCS1 *)writer->data;
7811 }
7812 outdata[writer->pos] = x;
7813 writer->pos++;
7814 ++s;
7815 }
7816 return 0;
7817 }
7818
7819 while (s < e) {
7820 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7821 enum PyUnicode_Kind outkind = writer->kind;
7822 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7823 if (outkind == PyUnicode_1BYTE_KIND) {
7824 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7825 Py_UCS4 maxchar = writer->maxchar;
7826 while (s < e) {
7827 ch = *s;
7828 x = mapdata_ucs2[ch];
7829 if (x > maxchar)
7830 goto Error;
7831 outdata[writer->pos] = x;
7832 writer->pos++;
7833 ++s;
7834 }
7835 break;
7836 }
7837 else if (outkind == PyUnicode_2BYTE_KIND) {
7838 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7839 while (s < e) {
7840 ch = *s;
7841 x = mapdata_ucs2[ch];
7842 if (x == 0xFFFE)
7843 goto Error;
7844 outdata[writer->pos] = x;
7845 writer->pos++;
7846 ++s;
7847 }
7848 break;
7849 }
7850 }
7851 ch = *s;
7852
7853 if (ch < maplen)
7854 x = PyUnicode_READ(mapkind, mapdata, ch);
7855 else
7856 x = 0xfffe; /* invalid value */
7857Error:
7858 if (x == 0xfffe)
7859 {
7860 /* undefined mapping */
7861 startinpos = s-starts;
7862 endinpos = startinpos+1;
7863 if (unicode_decode_call_errorhandler_writer(
7864 errors, &errorHandler,
7865 "charmap", "character maps to <undefined>",
7866 &starts, &e, &startinpos, &endinpos, &exc, &s,
7867 writer)) {
7868 goto onError;
7869 }
7870 continue;
7871 }
7872
7873 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7874 goto onError;
7875 ++s;
7876 }
7877 Py_XDECREF(errorHandler);
7878 Py_XDECREF(exc);
7879 return 0;
7880
7881onError:
7882 Py_XDECREF(errorHandler);
7883 Py_XDECREF(exc);
7884 return -1;
7885}
7886
7887static int
7888charmap_decode_mapping(const char *s,
7889 Py_ssize_t size,
7890 PyObject *mapping,
7891 const char *errors,
7892 _PyUnicodeWriter *writer)
7893{
7894 const char *starts = s;
7895 const char *e;
7896 Py_ssize_t startinpos, endinpos;
7897 PyObject *errorHandler = NULL, *exc = NULL;
7898 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007899 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007900
7901 e = s + size;
7902
7903 while (s < e) {
7904 ch = *s;
7905
7906 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7907 key = PyLong_FromLong((long)ch);
7908 if (key == NULL)
7909 goto onError;
7910
7911 item = PyObject_GetItem(mapping, key);
7912 Py_DECREF(key);
7913 if (item == NULL) {
7914 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7915 /* No mapping found means: mapping is undefined. */
7916 PyErr_Clear();
7917 goto Undefined;
7918 } else
7919 goto onError;
7920 }
7921
7922 /* Apply mapping */
7923 if (item == Py_None)
7924 goto Undefined;
7925 if (PyLong_Check(item)) {
7926 long value = PyLong_AS_LONG(item);
7927 if (value == 0xFFFE)
7928 goto Undefined;
7929 if (value < 0 || value > MAX_UNICODE) {
7930 PyErr_Format(PyExc_TypeError,
7931 "character mapping must be in range(0x%lx)",
7932 (unsigned long)MAX_UNICODE + 1);
7933 goto onError;
7934 }
7935
7936 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7937 goto onError;
7938 }
7939 else if (PyUnicode_Check(item)) {
7940 if (PyUnicode_READY(item) == -1)
7941 goto onError;
7942 if (PyUnicode_GET_LENGTH(item) == 1) {
7943 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7944 if (value == 0xFFFE)
7945 goto Undefined;
7946 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7947 goto onError;
7948 }
7949 else {
7950 writer->overallocate = 1;
7951 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7952 goto onError;
7953 }
7954 }
7955 else {
7956 /* wrong return value */
7957 PyErr_SetString(PyExc_TypeError,
7958 "character mapping must return integer, None or str");
7959 goto onError;
7960 }
7961 Py_CLEAR(item);
7962 ++s;
7963 continue;
7964
7965Undefined:
7966 /* undefined mapping */
7967 Py_CLEAR(item);
7968 startinpos = s-starts;
7969 endinpos = startinpos+1;
7970 if (unicode_decode_call_errorhandler_writer(
7971 errors, &errorHandler,
7972 "charmap", "character maps to <undefined>",
7973 &starts, &e, &startinpos, &endinpos, &exc, &s,
7974 writer)) {
7975 goto onError;
7976 }
7977 }
7978 Py_XDECREF(errorHandler);
7979 Py_XDECREF(exc);
7980 return 0;
7981
7982onError:
7983 Py_XDECREF(item);
7984 Py_XDECREF(errorHandler);
7985 Py_XDECREF(exc);
7986 return -1;
7987}
7988
Alexander Belopolsky40018472011-02-26 01:02:56 +00007989PyObject *
7990PyUnicode_DecodeCharmap(const char *s,
7991 Py_ssize_t size,
7992 PyObject *mapping,
7993 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007995 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007996
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 /* Default to Latin-1 */
7998 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008002 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008003 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008004 writer.min_length = size;
8005 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008007
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008008 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008009 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8010 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008011 }
8012 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008013 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8014 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008016 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008017
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008019 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 return NULL;
8021}
8022
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023/* Charmap encoding: the lookup table */
8024
Alexander Belopolsky40018472011-02-26 01:02:56 +00008025struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 PyObject_HEAD
8027 unsigned char level1[32];
8028 int count2, count3;
8029 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008030};
8031
8032static PyObject*
8033encoding_map_size(PyObject *obj, PyObject* args)
8034{
8035 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038}
8039
8040static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008041 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 PyDoc_STR("Return the size (in bytes) of this object") },
8043 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044};
8045
8046static void
8047encoding_map_dealloc(PyObject* o)
8048{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008049 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050}
8051
8052static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008053 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 "EncodingMap", /*tp_name*/
8055 sizeof(struct encoding_map), /*tp_basicsize*/
8056 0, /*tp_itemsize*/
8057 /* methods */
8058 encoding_map_dealloc, /*tp_dealloc*/
8059 0, /*tp_print*/
8060 0, /*tp_getattr*/
8061 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008062 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 0, /*tp_repr*/
8064 0, /*tp_as_number*/
8065 0, /*tp_as_sequence*/
8066 0, /*tp_as_mapping*/
8067 0, /*tp_hash*/
8068 0, /*tp_call*/
8069 0, /*tp_str*/
8070 0, /*tp_getattro*/
8071 0, /*tp_setattro*/
8072 0, /*tp_as_buffer*/
8073 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8074 0, /*tp_doc*/
8075 0, /*tp_traverse*/
8076 0, /*tp_clear*/
8077 0, /*tp_richcompare*/
8078 0, /*tp_weaklistoffset*/
8079 0, /*tp_iter*/
8080 0, /*tp_iternext*/
8081 encoding_map_methods, /*tp_methods*/
8082 0, /*tp_members*/
8083 0, /*tp_getset*/
8084 0, /*tp_base*/
8085 0, /*tp_dict*/
8086 0, /*tp_descr_get*/
8087 0, /*tp_descr_set*/
8088 0, /*tp_dictoffset*/
8089 0, /*tp_init*/
8090 0, /*tp_alloc*/
8091 0, /*tp_new*/
8092 0, /*tp_free*/
8093 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008094};
8095
8096PyObject*
8097PyUnicode_BuildEncodingMap(PyObject* string)
8098{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099 PyObject *result;
8100 struct encoding_map *mresult;
8101 int i;
8102 int need_dict = 0;
8103 unsigned char level1[32];
8104 unsigned char level2[512];
8105 unsigned char *mlevel1, *mlevel2, *mlevel3;
8106 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 int kind;
8108 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008109 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008111
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008112 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 PyErr_BadArgument();
8114 return NULL;
8115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 kind = PyUnicode_KIND(string);
8117 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008118 length = PyUnicode_GET_LENGTH(string);
8119 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 memset(level1, 0xFF, sizeof level1);
8121 memset(level2, 0xFF, sizeof level2);
8122
8123 /* If there isn't a one-to-one mapping of NULL to \0,
8124 or if there are non-BMP characters, we need to use
8125 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008127 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008128 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 ch = PyUnicode_READ(kind, data, i);
8131 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132 need_dict = 1;
8133 break;
8134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136 /* unmapped character */
8137 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 l1 = ch >> 11;
8139 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008140 if (level1[l1] == 0xFF)
8141 level1[l1] = count2++;
8142 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008143 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144 }
8145
8146 if (count2 >= 0xFF || count3 >= 0xFF)
8147 need_dict = 1;
8148
8149 if (need_dict) {
8150 PyObject *result = PyDict_New();
8151 PyObject *key, *value;
8152 if (!result)
8153 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008154 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008156 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 if (!key || !value)
8158 goto failed1;
8159 if (PyDict_SetItem(result, key, value) == -1)
8160 goto failed1;
8161 Py_DECREF(key);
8162 Py_DECREF(value);
8163 }
8164 return result;
8165 failed1:
8166 Py_XDECREF(key);
8167 Py_XDECREF(value);
8168 Py_DECREF(result);
8169 return NULL;
8170 }
8171
8172 /* Create a three-level trie */
8173 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8174 16*count2 + 128*count3 - 1);
8175 if (!result)
8176 return PyErr_NoMemory();
8177 PyObject_Init(result, &EncodingMapType);
8178 mresult = (struct encoding_map*)result;
8179 mresult->count2 = count2;
8180 mresult->count3 = count3;
8181 mlevel1 = mresult->level1;
8182 mlevel2 = mresult->level23;
8183 mlevel3 = mresult->level23 + 16*count2;
8184 memcpy(mlevel1, level1, 32);
8185 memset(mlevel2, 0xFF, 16*count2);
8186 memset(mlevel3, 0, 128*count3);
8187 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008188 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008190 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8191 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008192 /* unmapped character */
8193 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008194 o1 = ch>>11;
8195 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008196 i2 = 16*mlevel1[o1] + o2;
8197 if (mlevel2[i2] == 0xFF)
8198 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008199 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 i3 = 128*mlevel2[i2] + o3;
8201 mlevel3[i3] = i;
8202 }
8203 return result;
8204}
8205
8206static int
Victor Stinner22168992011-11-20 17:09:18 +01008207encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008208{
8209 struct encoding_map *map = (struct encoding_map*)mapping;
8210 int l1 = c>>11;
8211 int l2 = (c>>7) & 0xF;
8212 int l3 = c & 0x7F;
8213 int i;
8214
Victor Stinner22168992011-11-20 17:09:18 +01008215 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 if (c == 0)
8218 return 0;
8219 /* level 1*/
8220 i = map->level1[l1];
8221 if (i == 0xFF) {
8222 return -1;
8223 }
8224 /* level 2*/
8225 i = map->level23[16*i+l2];
8226 if (i == 0xFF) {
8227 return -1;
8228 }
8229 /* level 3 */
8230 i = map->level23[16*map->count2 + 128*i + l3];
8231 if (i == 0) {
8232 return -1;
8233 }
8234 return i;
8235}
8236
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008237/* Lookup the character ch in the mapping. If the character
8238 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008239 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008240static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008241charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242{
Christian Heimes217cfd12007-12-02 14:31:20 +00008243 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 PyObject *x;
8245
8246 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008248 x = PyObject_GetItem(mapping, w);
8249 Py_DECREF(w);
8250 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8252 /* No mapping found means: mapping is undefined. */
8253 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008254 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 } else
8256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008258 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008260 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 long value = PyLong_AS_LONG(x);
8262 if (value < 0 || value > 255) {
8263 PyErr_SetString(PyExc_TypeError,
8264 "character mapping must be in range(256)");
8265 Py_DECREF(x);
8266 return NULL;
8267 }
8268 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008270 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 /* wrong return value */
8274 PyErr_Format(PyExc_TypeError,
8275 "character mapping must return integer, bytes or None, not %.400s",
8276 x->ob_type->tp_name);
8277 Py_DECREF(x);
8278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 }
8280}
8281
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008282static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008283charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008284{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008285 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8286 /* exponentially overallocate to minimize reallocations */
8287 if (requiredsize < 2*outsize)
8288 requiredsize = 2*outsize;
8289 if (_PyBytes_Resize(outobj, requiredsize))
8290 return -1;
8291 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008292}
8293
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping, nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008298 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 space is available. Return a new reference to the object that
8300 was put in the output buffer, or Py_None, if the mapping was undefined
8301 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008302 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008303static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008304charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008305 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307 PyObject *rep;
8308 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008309 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310
Christian Heimes90aa7642007-12-19 02:45:37 +00008311 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314 if (res == -1)
8315 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 if (outsize<requiredsize)
8317 if (charmapencode_resize(outobj, outpos, requiredsize))
8318 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008319 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 outstart[(*outpos)++] = (char)res;
8321 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008322 }
8323
8324 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008325 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008327 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 Py_DECREF(rep);
8329 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 if (PyLong_Check(rep)) {
8332 Py_ssize_t requiredsize = *outpos+1;
8333 if (outsize<requiredsize)
8334 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8335 Py_DECREF(rep);
8336 return enc_EXCEPTION;
8337 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008338 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008340 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 else {
8342 const char *repchars = PyBytes_AS_STRING(rep);
8343 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8344 Py_ssize_t requiredsize = *outpos+repsize;
8345 if (outsize<requiredsize)
8346 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8347 Py_DECREF(rep);
8348 return enc_EXCEPTION;
8349 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008350 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 memcpy(outstart + *outpos, repchars, repsize);
8352 *outpos += repsize;
8353 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008355 Py_DECREF(rep);
8356 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357}
8358
8359/* handle an error in PyUnicode_EncodeCharmap
8360 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361static int
8362charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008363 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008365 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008366 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367{
8368 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008369 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008370 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008371 enum PyUnicode_Kind kind;
8372 void *data;
8373 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008375 Py_ssize_t collstartpos = *inpos;
8376 Py_ssize_t collendpos = *inpos+1;
8377 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378 char *encoding = "charmap";
8379 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008381 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008382 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383
Benjamin Petersonbac79492012-01-14 13:34:47 -05008384 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008385 return -1;
8386 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 /* find all unencodable characters */
8388 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008389 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008390 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008391 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008392 val = encoding_map_lookup(ch, mapping);
8393 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 break;
8395 ++collendpos;
8396 continue;
8397 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008398
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008399 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8400 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 if (rep==NULL)
8402 return -1;
8403 else if (rep!=Py_None) {
8404 Py_DECREF(rep);
8405 break;
8406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008407 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409 }
8410 /* cache callback name lookup
8411 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008412 if (*error_handler == _Py_ERROR_UNKNOWN)
8413 *error_handler = get_error_handler(errors);
8414
8415 switch (*error_handler) {
8416 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008417 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008419
8420 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 x = charmapencode_output('?', mapping, res, respos);
8423 if (x==enc_EXCEPTION) {
8424 return -1;
8425 }
8426 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008427 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 return -1;
8429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008430 }
8431 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008432 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 *inpos = collendpos;
8434 break;
Victor Stinner50149202015-09-22 00:26:54 +02008435
8436 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008437 /* generate replacement (temporarily (mis)uses p) */
8438 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 char buffer[2+29+1+1];
8440 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008441 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 for (cp = buffer; *cp; ++cp) {
8443 x = charmapencode_output(*cp, mapping, res, respos);
8444 if (x==enc_EXCEPTION)
8445 return -1;
8446 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008447 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return -1;
8449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008450 }
8451 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008452 *inpos = collendpos;
8453 break;
Victor Stinner50149202015-09-22 00:26:54 +02008454
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 default:
Victor Stinner50149202015-09-22 00:26:54 +02008456 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008457 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008461 if (PyBytes_Check(repunicode)) {
8462 /* Directly copy bytes result to output. */
8463 Py_ssize_t outsize = PyBytes_Size(*res);
8464 Py_ssize_t requiredsize;
8465 repsize = PyBytes_Size(repunicode);
8466 requiredsize = *respos + repsize;
8467 if (requiredsize > outsize)
8468 /* Make room for all additional bytes. */
8469 if (charmapencode_resize(res, respos, requiredsize)) {
8470 Py_DECREF(repunicode);
8471 return -1;
8472 }
8473 memcpy(PyBytes_AsString(*res) + *respos,
8474 PyBytes_AsString(repunicode), repsize);
8475 *respos += repsize;
8476 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008477 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008478 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008481 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008482 Py_DECREF(repunicode);
8483 return -1;
8484 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008485 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008486 data = PyUnicode_DATA(repunicode);
8487 kind = PyUnicode_KIND(repunicode);
8488 for (index = 0; index < repsize; index++) {
8489 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8490 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008492 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 return -1;
8494 }
8495 else if (x==enc_FAILED) {
8496 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008497 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 return -1;
8499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008500 }
8501 *inpos = newpos;
8502 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503 }
8504 return 0;
8505}
8506
Alexander Belopolsky40018472011-02-26 01:02:56 +00008507PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008508_PyUnicode_EncodeCharmap(PyObject *unicode,
8509 PyObject *mapping,
8510 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512 /* output object */
8513 PyObject *res = NULL;
8514 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008515 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008516 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008518 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008519 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008521 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008522 void *data;
8523 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524
Benjamin Petersonbac79492012-01-14 13:34:47 -05008525 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008526 return NULL;
8527 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008528 data = PyUnicode_DATA(unicode);
8529 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008530
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 /* Default to Latin-1 */
8532 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008533 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 /* allocate enough for a simple encoding without
8536 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008537 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 if (res == NULL)
8539 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008540 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008544 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008546 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 if (x==enc_EXCEPTION) /* error */
8548 goto onError;
8549 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008552 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 &res, &respos)) {
8554 goto onError;
8555 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008556 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 else
8558 /* done with this character => adjust input position */
8559 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008563 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008564 if (_PyBytes_Resize(&res, respos) < 0)
8565 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008568 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 return res;
8570
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 Py_XDECREF(res);
8573 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008574 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 return NULL;
8576}
8577
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008578/* Deprecated */
8579PyObject *
8580PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8581 Py_ssize_t size,
8582 PyObject *mapping,
8583 const char *errors)
8584{
8585 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008586 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008587 if (unicode == NULL)
8588 return NULL;
8589 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8590 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008591 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008592}
8593
Alexander Belopolsky40018472011-02-26 01:02:56 +00008594PyObject *
8595PyUnicode_AsCharmapString(PyObject *unicode,
8596 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597{
8598 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 PyErr_BadArgument();
8600 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008602 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603}
8604
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008606static void
8607make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008609 Py_ssize_t startpos, Py_ssize_t endpos,
8610 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 *exceptionObject = _PyUnicodeTranslateError_Create(
8614 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 }
8616 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8618 goto onError;
8619 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8620 goto onError;
8621 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8622 goto onError;
8623 return;
8624 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008625 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 }
8627}
8628
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629/* error handling callback helper:
8630 build arguments, call the callback and check the arguments,
8631 put the result into newpos and return the replacement string, which
8632 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008633static PyObject *
8634unicode_translate_call_errorhandler(const char *errors,
8635 PyObject **errorHandler,
8636 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008638 Py_ssize_t startpos, Py_ssize_t endpos,
8639 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008641 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008643 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 PyObject *restuple;
8645 PyObject *resunicode;
8646
8647 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651 }
8652
8653 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008658 restuple = PyObject_CallFunctionObjArgs(
8659 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008663 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 Py_DECREF(restuple);
8665 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008667 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 &resunicode, &i_newpos)) {
8669 Py_DECREF(restuple);
8670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008672 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008674 else
8675 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008677 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 Py_DECREF(restuple);
8679 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008680 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 Py_INCREF(resunicode);
8682 Py_DECREF(restuple);
8683 return resunicode;
8684}
8685
8686/* Lookup the character ch in the mapping and put the result in result,
8687 which must be decrefed by the caller.
8688 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008689static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691{
Christian Heimes217cfd12007-12-02 14:31:20 +00008692 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 PyObject *x;
8694
8695 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 x = PyObject_GetItem(mapping, w);
8698 Py_DECREF(w);
8699 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8701 /* No mapping found means: use 1:1 mapping. */
8702 PyErr_Clear();
8703 *result = NULL;
8704 return 0;
8705 } else
8706 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 }
8708 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 *result = x;
8710 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008712 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008714 if (value < 0 || value > MAX_UNICODE) {
8715 PyErr_Format(PyExc_ValueError,
8716 "character mapping must be in range(0x%x)",
8717 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 Py_DECREF(x);
8719 return -1;
8720 }
8721 *result = x;
8722 return 0;
8723 }
8724 else if (PyUnicode_Check(x)) {
8725 *result = x;
8726 return 0;
8727 }
8728 else {
8729 /* wrong return value */
8730 PyErr_SetString(PyExc_TypeError,
8731 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008732 Py_DECREF(x);
8733 return -1;
8734 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735}
Victor Stinner1194ea02014-04-04 19:37:40 +02008736
8737/* lookup the character, write the result into the writer.
8738 Return 1 if the result was written into the writer, return 0 if the mapping
8739 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008740static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008741charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8742 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743{
Victor Stinner1194ea02014-04-04 19:37:40 +02008744 PyObject *item;
8745
8746 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008748
8749 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008751 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008752 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008754 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008756
8757 if (item == Py_None) {
8758 Py_DECREF(item);
8759 return 0;
8760 }
8761
8762 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008763 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8764 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8765 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008766 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8767 Py_DECREF(item);
8768 return -1;
8769 }
8770 Py_DECREF(item);
8771 return 1;
8772 }
8773
8774 if (!PyUnicode_Check(item)) {
8775 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008777 }
8778
8779 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8780 Py_DECREF(item);
8781 return -1;
8782 }
8783
8784 Py_DECREF(item);
8785 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008786}
8787
Victor Stinner89a76ab2014-04-05 11:44:04 +02008788static int
8789unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8790 Py_UCS1 *translate)
8791{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008792 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008793 int ret = 0;
8794
Victor Stinner89a76ab2014-04-05 11:44:04 +02008795 if (charmaptranslate_lookup(ch, mapping, &item)) {
8796 return -1;
8797 }
8798
8799 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008800 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008801 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008802 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008803 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008804 /* not found => default to 1:1 mapping */
8805 translate[ch] = ch;
8806 return 1;
8807 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008808 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008809 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008810 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8811 used it */
8812 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813 /* invalid character or character outside ASCII:
8814 skip the fast translate */
8815 goto exit;
8816 }
8817 translate[ch] = (Py_UCS1)replace;
8818 }
8819 else if (PyUnicode_Check(item)) {
8820 Py_UCS4 replace;
8821
8822 if (PyUnicode_READY(item) == -1) {
8823 Py_DECREF(item);
8824 return -1;
8825 }
8826 if (PyUnicode_GET_LENGTH(item) != 1)
8827 goto exit;
8828
8829 replace = PyUnicode_READ_CHAR(item, 0);
8830 if (replace > 127)
8831 goto exit;
8832 translate[ch] = (Py_UCS1)replace;
8833 }
8834 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008835 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008836 goto exit;
8837 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008838 ret = 1;
8839
Benjamin Peterson1365de72014-04-07 20:15:41 -04008840 exit:
8841 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008842 return ret;
8843}
8844
8845/* Fast path for ascii => ascii translation. Return 1 if the whole string
8846 was translated into writer, return 0 if the input string was partially
8847 translated into writer, raise an exception and return -1 on error. */
8848static int
8849unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008850 _PyUnicodeWriter *writer, int ignore,
8851 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852{
Victor Stinner872b2912014-04-05 14:27:07 +02008853 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008854 Py_ssize_t len;
8855 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008856 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 len = PyUnicode_GET_LENGTH(input);
8859
Victor Stinner872b2912014-04-05 14:27:07 +02008860 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861
8862 in = PyUnicode_1BYTE_DATA(input);
8863 end = in + len;
8864
8865 assert(PyUnicode_IS_ASCII(writer->buffer));
8866 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8867 out = PyUnicode_1BYTE_DATA(writer->buffer);
8868
Victor Stinner872b2912014-04-05 14:27:07 +02008869 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008870 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008871 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008873 int translate = unicode_fast_translate_lookup(mapping, ch,
8874 ascii_table);
8875 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008876 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008877 if (translate == 0)
8878 goto exit;
8879 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008880 }
Victor Stinner872b2912014-04-05 14:27:07 +02008881 if (ch2 == 0xfe) {
8882 if (ignore)
8883 continue;
8884 goto exit;
8885 }
8886 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008887 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008888 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008889 }
Victor Stinner872b2912014-04-05 14:27:07 +02008890 res = 1;
8891
8892exit:
8893 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008894 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008895 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008896}
8897
Victor Stinner3222da22015-10-01 22:07:32 +02008898static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899_PyUnicode_TranslateCharmap(PyObject *input,
8900 PyObject *mapping,
8901 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008904 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 Py_ssize_t size, i;
8906 int kind;
8907 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008908 _PyUnicodeWriter writer;
8909 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910 char *reason = "character maps to <undefined>";
8911 PyObject *errorHandler = NULL;
8912 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008913 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008915
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 PyErr_BadArgument();
8918 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921 if (PyUnicode_READY(input) == -1)
8922 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008923 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 kind = PyUnicode_KIND(input);
8925 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008927 if (size == 0)
8928 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008930 /* allocate enough for a simple 1:1 translation without
8931 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008932 _PyUnicodeWriter_Init(&writer);
8933 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935
Victor Stinner872b2912014-04-05 14:27:07 +02008936 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8937
Victor Stinner33798672016-03-01 21:59:58 +01008938 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008939 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008940 if (PyUnicode_IS_ASCII(input)) {
8941 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8942 if (res < 0) {
8943 _PyUnicodeWriter_Dealloc(&writer);
8944 return NULL;
8945 }
8946 if (res == 1)
8947 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008948 }
Victor Stinner33798672016-03-01 21:59:58 +01008949 else {
8950 i = 0;
8951 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008955 int translate;
8956 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8957 Py_ssize_t newpos;
8958 /* startpos for collecting untranslatable chars */
8959 Py_ssize_t collstart;
8960 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008961 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962
Victor Stinner1194ea02014-04-04 19:37:40 +02008963 ch = PyUnicode_READ(kind, data, i);
8964 translate = charmaptranslate_output(ch, mapping, &writer);
8965 if (translate < 0)
8966 goto onError;
8967
8968 if (translate != 0) {
8969 /* it worked => adjust input pointer */
8970 ++i;
8971 continue;
8972 }
8973
8974 /* untranslatable character */
8975 collstart = i;
8976 collend = i+1;
8977
8978 /* find all untranslatable characters */
8979 while (collend < size) {
8980 PyObject *x;
8981 ch = PyUnicode_READ(kind, data, collend);
8982 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008983 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008984 Py_XDECREF(x);
8985 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008987 ++collend;
8988 }
8989
8990 if (ignore) {
8991 i = collend;
8992 }
8993 else {
8994 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8995 reason, input, &exc,
8996 collstart, collend, &newpos);
8997 if (repunicode == NULL)
8998 goto onError;
8999 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009001 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009002 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009003 Py_DECREF(repunicode);
9004 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009005 }
9006 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009007 Py_XDECREF(exc);
9008 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009009 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009013 Py_XDECREF(exc);
9014 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 return NULL;
9016}
9017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018/* Deprecated. Use PyUnicode_Translate instead. */
9019PyObject *
9020PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9021 Py_ssize_t size,
9022 PyObject *mapping,
9023 const char *errors)
9024{
Christian Heimes5f520f42012-09-11 14:03:25 +02009025 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009026 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 if (!unicode)
9028 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009029 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9030 Py_DECREF(unicode);
9031 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032}
9033
Alexander Belopolsky40018472011-02-26 01:02:56 +00009034PyObject *
9035PyUnicode_Translate(PyObject *str,
9036 PyObject *mapping,
9037 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009039 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009040 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009041 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042}
Tim Petersced69f82003-09-16 20:30:58 +00009043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009045fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046{
9047 /* No need to call PyUnicode_READY(self) because this function is only
9048 called as a callback from fixup() which does it already. */
9049 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9050 const int kind = PyUnicode_KIND(self);
9051 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009052 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009053 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 Py_ssize_t i;
9055
9056 for (i = 0; i < len; ++i) {
9057 ch = PyUnicode_READ(kind, data, i);
9058 fixed = 0;
9059 if (ch > 127) {
9060 if (Py_UNICODE_ISSPACE(ch))
9061 fixed = ' ';
9062 else {
9063 const int decimal = Py_UNICODE_TODECIMAL(ch);
9064 if (decimal >= 0)
9065 fixed = '0' + decimal;
9066 }
9067 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009068 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009069 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 PyUnicode_WRITE(kind, data, i, fixed);
9071 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009072 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009073 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 }
9076
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009077 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078}
9079
9080PyObject *
9081_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9082{
9083 if (!PyUnicode_Check(unicode)) {
9084 PyErr_BadInternalCall();
9085 return NULL;
9086 }
9087 if (PyUnicode_READY(unicode) == -1)
9088 return NULL;
9089 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9090 /* If the string is already ASCII, just return the same string */
9091 Py_INCREF(unicode);
9092 return unicode;
9093 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009094 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095}
9096
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009097PyObject *
9098PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9099 Py_ssize_t length)
9100{
Victor Stinnerf0124502011-11-21 23:12:56 +01009101 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009102 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009103 Py_UCS4 maxchar;
9104 enum PyUnicode_Kind kind;
9105 void *data;
9106
Victor Stinner99d7ad02012-02-22 13:37:39 +01009107 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009108 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009109 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009110 if (ch > 127) {
9111 int decimal = Py_UNICODE_TODECIMAL(ch);
9112 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009113 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009114 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009115 }
9116 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009117
9118 /* Copy to a new string */
9119 decimal = PyUnicode_New(length, maxchar);
9120 if (decimal == NULL)
9121 return decimal;
9122 kind = PyUnicode_KIND(decimal);
9123 data = PyUnicode_DATA(decimal);
9124 /* Iterate over code points */
9125 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009126 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009127 if (ch > 127) {
9128 int decimal = Py_UNICODE_TODECIMAL(ch);
9129 if (decimal >= 0)
9130 ch = '0' + decimal;
9131 }
9132 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009134 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009135}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009136/* --- Decimal Encoder ---------------------------------------------------- */
9137
Alexander Belopolsky40018472011-02-26 01:02:56 +00009138int
9139PyUnicode_EncodeDecimal(Py_UNICODE *s,
9140 Py_ssize_t length,
9141 char *output,
9142 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009143{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009144 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009145 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009146 enum PyUnicode_Kind kind;
9147 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009148
9149 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 PyErr_BadArgument();
9151 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009152 }
9153
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009154 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009155 if (unicode == NULL)
9156 return -1;
9157
Victor Stinner42bf7752011-11-21 22:52:58 +01009158 kind = PyUnicode_KIND(unicode);
9159 data = PyUnicode_DATA(unicode);
9160
Victor Stinnerb84d7232011-11-22 01:50:07 +01009161 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009162 PyObject *exc;
9163 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009165 Py_ssize_t startpos;
9166
9167 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009168
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009170 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009171 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009173 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 decimal = Py_UNICODE_TODECIMAL(ch);
9175 if (decimal >= 0) {
9176 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009177 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009178 continue;
9179 }
9180 if (0 < ch && ch < 256) {
9181 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009182 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009183 continue;
9184 }
Victor Stinner6345be92011-11-25 20:09:01 +01009185
Victor Stinner42bf7752011-11-21 22:52:58 +01009186 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009187 exc = NULL;
9188 raise_encode_exception(&exc, "decimal", unicode,
9189 startpos, startpos+1,
9190 "invalid decimal Unicode string");
9191 Py_XDECREF(exc);
9192 Py_DECREF(unicode);
9193 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009194 }
9195 /* 0-terminate the output string */
9196 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009197 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009198 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009199}
9200
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201/* --- Helpers ------------------------------------------------------------ */
9202
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`:

     - `end` greater than `len` is clamped to `len`;
     - negative `end` / `start` are taken relative to the end of the
       sequence (len is added) and clamped to 0 if still negative.

   `start` and `end` must be modifiable lvalues; they are updated in place.
   `start` is NOT clamped to `len` on the upper side.

   The body is wrapped in do { ... } while (0) so that the macro behaves
   like a single statement (safe as the body of an unbraced if/else), and
   every argument is parenthesized.  Arguments are evaluated more than
   once, so they must be free of side effects.  Use it with a trailing
   semicolon:  ADJUST_INDICES(start, end, len); */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009219any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009221 Py_ssize_t end,
9222 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009224 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 void *buf1, *buf2;
9226 Py_ssize_t len1, len2, result;
9227
9228 kind1 = PyUnicode_KIND(s1);
9229 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009230 if (kind1 < kind2)
9231 return -1;
9232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 len1 = PyUnicode_GET_LENGTH(s1);
9234 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009235 ADJUST_INDICES(start, end, len1);
9236 if (end - start < len2)
9237 return -1;
9238
9239 buf1 = PyUnicode_DATA(s1);
9240 buf2 = PyUnicode_DATA(s2);
9241 if (len2 == 1) {
9242 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9243 result = findchar((const char *)buf1 + kind1*start,
9244 kind1, end - start, ch, direction);
9245 if (result == -1)
9246 return -1;
9247 else
9248 return start + result;
9249 }
9250
9251 if (kind2 != kind1) {
9252 buf2 = _PyUnicode_AsKind(s2, kind1);
9253 if (!buf2)
9254 return -2;
9255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256
Victor Stinner794d5672011-10-10 03:21:36 +02009257 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009258 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009259 case PyUnicode_1BYTE_KIND:
9260 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9261 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9262 else
9263 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9264 break;
9265 case PyUnicode_2BYTE_KIND:
9266 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9267 break;
9268 case PyUnicode_4BYTE_KIND:
9269 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9270 break;
9271 default:
9272 assert(0); result = -2;
9273 }
9274 }
9275 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009276 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009277 case PyUnicode_1BYTE_KIND:
9278 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9279 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9280 else
9281 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9282 break;
9283 case PyUnicode_2BYTE_KIND:
9284 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_4BYTE_KIND:
9287 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 default:
9290 assert(0); result = -2;
9291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 }
9293
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009294 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 PyMem_Free(buf2);
9296
9297 return result;
9298}
9299
9300Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009301_PyUnicode_InsertThousandsGrouping(
9302 PyObject *unicode, Py_ssize_t index,
9303 Py_ssize_t n_buffer,
9304 void *digits, Py_ssize_t n_digits,
9305 Py_ssize_t min_width,
9306 const char *grouping, PyObject *thousands_sep,
9307 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308{
Victor Stinner41a863c2012-02-24 00:37:51 +01009309 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009310 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009311 Py_ssize_t thousands_sep_len;
9312 Py_ssize_t len;
9313
9314 if (unicode != NULL) {
9315 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009316 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009317 }
9318 else {
9319 kind = PyUnicode_1BYTE_KIND;
9320 data = NULL;
9321 }
9322 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9323 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9324 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9325 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009326 if (thousands_sep_kind < kind) {
9327 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9328 if (!thousands_sep_data)
9329 return -1;
9330 }
9331 else {
9332 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9333 if (!data)
9334 return -1;
9335 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009336 }
9337
Benjamin Petersonead6b532011-12-20 17:23:42 -06009338 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009340 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009341 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009342 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009343 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009344 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009345 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009346 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009347 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009348 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009349 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009350 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009352 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009353 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009354 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009355 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009356 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009358 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009359 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009360 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009361 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 break;
9363 default:
9364 assert(0);
9365 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009367 if (unicode != NULL && thousands_sep_kind != kind) {
9368 if (thousands_sep_kind < kind)
9369 PyMem_Free(thousands_sep_data);
9370 else
9371 PyMem_Free(data);
9372 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 if (unicode == NULL) {
9374 *maxchar = 127;
9375 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009376 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009377 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009378 }
9379 }
9380 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381}
9382
9383
Alexander Belopolsky40018472011-02-26 01:02:56 +00009384Py_ssize_t
9385PyUnicode_Count(PyObject *str,
9386 PyObject *substr,
9387 Py_ssize_t start,
9388 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009390 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009391 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392 void *buf1 = NULL, *buf2 = NULL;
9393 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009394
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009395 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009396 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009397
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009398 kind1 = PyUnicode_KIND(str);
9399 kind2 = PyUnicode_KIND(substr);
9400 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009401 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009402
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009403 len1 = PyUnicode_GET_LENGTH(str);
9404 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009406 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009407 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009408
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009409 buf1 = PyUnicode_DATA(str);
9410 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009412 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009413 if (!buf2)
9414 goto onError;
9415 }
9416
9417 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009419 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009420 result = asciilib_count(
9421 ((Py_UCS1*)buf1) + start, end - start,
9422 buf2, len2, PY_SSIZE_T_MAX
9423 );
9424 else
9425 result = ucs1lib_count(
9426 ((Py_UCS1*)buf1) + start, end - start,
9427 buf2, len2, PY_SSIZE_T_MAX
9428 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 break;
9430 case PyUnicode_2BYTE_KIND:
9431 result = ucs2lib_count(
9432 ((Py_UCS2*)buf1) + start, end - start,
9433 buf2, len2, PY_SSIZE_T_MAX
9434 );
9435 break;
9436 case PyUnicode_4BYTE_KIND:
9437 result = ucs4lib_count(
9438 ((Py_UCS4*)buf1) + start, end - start,
9439 buf2, len2, PY_SSIZE_T_MAX
9440 );
9441 break;
9442 default:
9443 assert(0); result = 0;
9444 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009445
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009446 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 PyMem_Free(buf2);
9448
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009451 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 PyMem_Free(buf2);
9453 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454}
9455
Alexander Belopolsky40018472011-02-26 01:02:56 +00009456Py_ssize_t
9457PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009458 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009459 Py_ssize_t start,
9460 Py_ssize_t end,
9461 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009463 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009464 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009465
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009466 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467}
9468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469Py_ssize_t
9470PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9471 Py_ssize_t start, Py_ssize_t end,
9472 int direction)
9473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009475 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 if (PyUnicode_READY(str) == -1)
9477 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009478 len = PyUnicode_GET_LENGTH(str);
9479 ADJUST_INDICES(start, end, len);
9480 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009481 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009483 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9484 kind, end-start, ch, direction);
9485 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009487 else
9488 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489}
9490
Alexander Belopolsky40018472011-02-26 01:02:56 +00009491static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009492tailmatch(PyObject *self,
9493 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009494 Py_ssize_t start,
9495 Py_ssize_t end,
9496 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 int kind_self;
9499 int kind_sub;
9500 void *data_self;
9501 void *data_sub;
9502 Py_ssize_t offset;
9503 Py_ssize_t i;
9504 Py_ssize_t end_sub;
9505
9506 if (PyUnicode_READY(self) == -1 ||
9507 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009508 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9511 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009515 if (PyUnicode_GET_LENGTH(substring) == 0)
9516 return 1;
9517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 kind_self = PyUnicode_KIND(self);
9519 data_self = PyUnicode_DATA(self);
9520 kind_sub = PyUnicode_KIND(substring);
9521 data_sub = PyUnicode_DATA(substring);
9522 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9523
9524 if (direction > 0)
9525 offset = end;
9526 else
9527 offset = start;
9528
9529 if (PyUnicode_READ(kind_self, data_self, offset) ==
9530 PyUnicode_READ(kind_sub, data_sub, 0) &&
9531 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9532 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9533 /* If both are of the same kind, memcmp is sufficient */
9534 if (kind_self == kind_sub) {
9535 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009536 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 data_sub,
9538 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009539 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009541 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 else {
9543 /* We do not need to compare 0 and len(substring)-1 because
9544 the if statement above ensured already that they are equal
9545 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546 for (i = 1; i < end_sub; ++i) {
9547 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9548 PyUnicode_READ(kind_sub, data_sub, i))
9549 return 0;
9550 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553 }
9554
9555 return 0;
9556}
9557
Alexander Belopolsky40018472011-02-26 01:02:56 +00009558Py_ssize_t
9559PyUnicode_Tailmatch(PyObject *str,
9560 PyObject *substr,
9561 Py_ssize_t start,
9562 Py_ssize_t end,
9563 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009565 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009567
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009568 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569}
9570
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571/* Apply fixfct filter to the Unicode object self and return a
9572 reference to the modified object */
9573
Alexander Belopolsky40018472011-02-26 01:02:56 +00009574static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009575fixup(PyObject *self,
9576 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 PyObject *u;
9579 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009580 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009582 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009584 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009585 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587 /* fix functions return the new maximum character in a string,
9588 if the kind of the resulting unicode object does not change,
9589 everything is fine. Otherwise we need to change the string kind
9590 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009591 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009592
9593 if (maxchar_new == 0) {
9594 /* no changes */;
9595 if (PyUnicode_CheckExact(self)) {
9596 Py_DECREF(u);
9597 Py_INCREF(self);
9598 return self;
9599 }
9600 else
9601 return u;
9602 }
9603
Victor Stinnere6abb482012-05-02 01:15:40 +02009604 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605
Victor Stinnereaab6042011-12-11 22:22:39 +01009606 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009608
9609 /* In case the maximum character changed, we need to
9610 convert the string to the new category. */
9611 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9612 if (v == NULL) {
9613 Py_DECREF(u);
9614 return NULL;
9615 }
9616 if (maxchar_new > maxchar_old) {
9617 /* If the maxchar increased so that the kind changed, not all
9618 characters are representable anymore and we need to fix the
9619 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009620 _PyUnicode_FastCopyCharacters(v, 0,
9621 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009622 maxchar_old = fixfct(v);
9623 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 }
9625 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009626 _PyUnicode_FastCopyCharacters(v, 0,
9627 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009629 Py_DECREF(u);
9630 assert(_PyUnicode_CheckConsistency(v, 1));
9631 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632}
9633
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009634static PyObject *
9635ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9638 char *resdata, *data = PyUnicode_DATA(self);
9639 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009640
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009641 res = PyUnicode_New(len, 127);
9642 if (res == NULL)
9643 return NULL;
9644 resdata = PyUnicode_DATA(res);
9645 if (lower)
9646 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648 _Py_bytes_upper(resdata, data, len);
9649 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650}
9651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655 Py_ssize_t j;
9656 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009657 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009659
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9661
9662 where ! is a negation and \p{xxx} is a character with property xxx.
9663 */
9664 for (j = i - 1; j >= 0; j--) {
9665 c = PyUnicode_READ(kind, data, j);
9666 if (!_PyUnicode_IsCaseIgnorable(c))
9667 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9670 if (final_sigma) {
9671 for (j = i + 1; j < length; j++) {
9672 c = PyUnicode_READ(kind, data, j);
9673 if (!_PyUnicode_IsCaseIgnorable(c))
9674 break;
9675 }
9676 final_sigma = j == length || !_PyUnicode_IsCased(c);
9677 }
9678 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679}
9680
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681static int
9682lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9683 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685 /* Obscure special case. */
9686 if (c == 0x3A3) {
9687 mapped[0] = handle_capital_sigma(kind, data, length, i);
9688 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691}
9692
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693static Py_ssize_t
9694do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696 Py_ssize_t i, k = 0;
9697 int n_res, j;
9698 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009699
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 c = PyUnicode_READ(kind, data, 0);
9701 n_res = _PyUnicode_ToUpperFull(c, mapped);
9702 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009703 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706 for (i = 1; i < length; i++) {
9707 c = PyUnicode_READ(kind, data, i);
9708 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9709 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009710 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009712 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009713 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715}
9716
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717static Py_ssize_t
9718do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9719 Py_ssize_t i, k = 0;
9720
9721 for (i = 0; i < length; i++) {
9722 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9723 int n_res, j;
9724 if (Py_UNICODE_ISUPPER(c)) {
9725 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9726 }
9727 else if (Py_UNICODE_ISLOWER(c)) {
9728 n_res = _PyUnicode_ToUpperFull(c, mapped);
9729 }
9730 else {
9731 n_res = 1;
9732 mapped[0] = c;
9733 }
9734 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009735 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 res[k++] = mapped[j];
9737 }
9738 }
9739 return k;
9740}
9741
9742static Py_ssize_t
9743do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9744 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009746 Py_ssize_t i, k = 0;
9747
9748 for (i = 0; i < length; i++) {
9749 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9750 int n_res, j;
9751 if (lower)
9752 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9753 else
9754 n_res = _PyUnicode_ToUpperFull(c, mapped);
9755 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009756 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009757 res[k++] = mapped[j];
9758 }
9759 }
9760 return k;
9761}
9762
9763static Py_ssize_t
9764do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9765{
9766 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9767}
9768
9769static Py_ssize_t
9770do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9771{
9772 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9773}
9774
Benjamin Petersone51757f2012-01-12 21:10:29 -05009775static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009776do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9777{
9778 Py_ssize_t i, k = 0;
9779
9780 for (i = 0; i < length; i++) {
9781 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9782 Py_UCS4 mapped[3];
9783 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9784 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009785 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009786 res[k++] = mapped[j];
9787 }
9788 }
9789 return k;
9790}
9791
9792static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009793do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9794{
9795 Py_ssize_t i, k = 0;
9796 int previous_is_cased;
9797
9798 previous_is_cased = 0;
9799 for (i = 0; i < length; i++) {
9800 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9801 Py_UCS4 mapped[3];
9802 int n_res, j;
9803
9804 if (previous_is_cased)
9805 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9806 else
9807 n_res = _PyUnicode_ToTitleFull(c, mapped);
9808
9809 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009810 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009811 res[k++] = mapped[j];
9812 }
9813
9814 previous_is_cased = _PyUnicode_IsCased(c);
9815 }
9816 return k;
9817}
9818
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009819static PyObject *
9820case_operation(PyObject *self,
9821 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9822{
9823 PyObject *res = NULL;
9824 Py_ssize_t length, newlength = 0;
9825 int kind, outkind;
9826 void *data, *outdata;
9827 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9828
Benjamin Petersoneea48462012-01-16 14:28:50 -05009829 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009830
9831 kind = PyUnicode_KIND(self);
9832 data = PyUnicode_DATA(self);
9833 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009834 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009835 PyErr_SetString(PyExc_OverflowError, "string is too long");
9836 return NULL;
9837 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009838 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839 if (tmp == NULL)
9840 return PyErr_NoMemory();
9841 newlength = perform(kind, data, length, tmp, &maxchar);
9842 res = PyUnicode_New(newlength, maxchar);
9843 if (res == NULL)
9844 goto leave;
9845 tmpend = tmp + newlength;
9846 outdata = PyUnicode_DATA(res);
9847 outkind = PyUnicode_KIND(res);
9848 switch (outkind) {
9849 case PyUnicode_1BYTE_KIND:
9850 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9851 break;
9852 case PyUnicode_2BYTE_KIND:
9853 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9854 break;
9855 case PyUnicode_4BYTE_KIND:
9856 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9857 break;
9858 default:
9859 assert(0);
9860 break;
9861 }
9862 leave:
9863 PyMem_FREE(tmp);
9864 return res;
9865}
9866
Tim Peters8ce9f162004-08-27 01:49:32 +00009867PyObject *
9868PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009870 PyObject *res;
9871 PyObject *fseq;
9872 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009873 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009875 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009876 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009877 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009878 }
9879
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009880 /* NOTE: the following code can't call back into Python code,
9881 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009882 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009883
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009884 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009885 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009886 res = _PyUnicode_JoinArray(separator, items, seqlen);
9887 Py_DECREF(fseq);
9888 return res;
9889}
9890
9891PyObject *
9892_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9893{
9894 PyObject *res = NULL; /* the result */
9895 PyObject *sep = NULL;
9896 Py_ssize_t seplen;
9897 PyObject *item;
9898 Py_ssize_t sz, i, res_offset;
9899 Py_UCS4 maxchar;
9900 Py_UCS4 item_maxchar;
9901 int use_memcpy;
9902 unsigned char *res_data = NULL, *sep_data = NULL;
9903 PyObject *last_obj;
9904 unsigned int kind = 0;
9905
Tim Peters05eba1f2004-08-27 21:32:02 +00009906 /* If empty sequence, return u"". */
9907 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009908 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009909 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009910
Tim Peters05eba1f2004-08-27 21:32:02 +00009911 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009912 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009913 if (seqlen == 1) {
9914 if (PyUnicode_CheckExact(items[0])) {
9915 res = items[0];
9916 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009917 return res;
9918 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009919 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009920 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009921 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009922 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009923 /* Set up sep and seplen */
9924 if (separator == NULL) {
9925 /* fall back to a blank space separator */
9926 sep = PyUnicode_FromOrdinal(' ');
9927 if (!sep)
9928 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009929 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009930 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009931 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009932 else {
9933 if (!PyUnicode_Check(separator)) {
9934 PyErr_Format(PyExc_TypeError,
9935 "separator: expected str instance,"
9936 " %.80s found",
9937 Py_TYPE(separator)->tp_name);
9938 goto onError;
9939 }
9940 if (PyUnicode_READY(separator))
9941 goto onError;
9942 sep = separator;
9943 seplen = PyUnicode_GET_LENGTH(separator);
9944 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9945 /* inc refcount to keep this code path symmetric with the
9946 above case of a blank separator */
9947 Py_INCREF(sep);
9948 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009949 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009950 }
9951
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009952 /* There are at least two things to join, or else we have a subclass
9953 * of str in the sequence.
9954 * Do a pre-pass to figure out the total amount of space we'll
9955 * need (sz), and see whether all argument are strings.
9956 */
9957 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009958#ifdef Py_DEBUG
9959 use_memcpy = 0;
9960#else
9961 use_memcpy = 1;
9962#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009963 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009964 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009965 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009966 if (!PyUnicode_Check(item)) {
9967 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009968 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009969 " %.80s found",
9970 i, Py_TYPE(item)->tp_name);
9971 goto onError;
9972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 if (PyUnicode_READY(item) == -1)
9974 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009975 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009977 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009978 if (i != 0) {
9979 add_sz += seplen;
9980 }
9981 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009982 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009984 goto onError;
9985 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009986 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009987 if (use_memcpy && last_obj != NULL) {
9988 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9989 use_memcpy = 0;
9990 }
9991 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009992 }
Tim Petersced69f82003-09-16 20:30:58 +00009993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009995 if (res == NULL)
9996 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009997
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009998 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009999#ifdef Py_DEBUG
10000 use_memcpy = 0;
10001#else
10002 if (use_memcpy) {
10003 res_data = PyUnicode_1BYTE_DATA(res);
10004 kind = PyUnicode_KIND(res);
10005 if (seplen != 0)
10006 sep_data = PyUnicode_1BYTE_DATA(sep);
10007 }
10008#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010009 if (use_memcpy) {
10010 for (i = 0; i < seqlen; ++i) {
10011 Py_ssize_t itemlen;
10012 item = items[i];
10013
10014 /* Copy item, and maybe the separator. */
10015 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010016 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010017 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010018 kind * seplen);
10019 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010020 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010021
10022 itemlen = PyUnicode_GET_LENGTH(item);
10023 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010024 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010025 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010026 kind * itemlen);
10027 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010028 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010029 }
10030 assert(res_data == PyUnicode_1BYTE_DATA(res)
10031 + kind * PyUnicode_GET_LENGTH(res));
10032 }
10033 else {
10034 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10035 Py_ssize_t itemlen;
10036 item = items[i];
10037
10038 /* Copy item, and maybe the separator. */
10039 if (i && seplen != 0) {
10040 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10041 res_offset += seplen;
10042 }
10043
10044 itemlen = PyUnicode_GET_LENGTH(item);
10045 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010046 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010047 res_offset += itemlen;
10048 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010049 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010050 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010051 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010054 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010056
Benjamin Peterson29060642009-01-31 22:14:21 +000010057 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010059 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060 return NULL;
10061}
10062
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width (1, 2 or 4 byte storage); PyUnicode_WCHAR_KIND is not
   accepted.

   Every macro argument is parenthesized where it is expanded, so
   arbitrary expressions (e.g. `cond ? a : b` or `k & mask`) can be passed
   safely.  Note that `kind`, `value` and `length` may be evaluated more
   than once, so they must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10086
Victor Stinnerd3f08822012-05-29 12:57:52 +020010087void
10088_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10089 Py_UCS4 fill_char)
10090{
10091 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10092 const void *data = PyUnicode_DATA(unicode);
10093 assert(PyUnicode_IS_READY(unicode));
10094 assert(unicode_modifiable(unicode));
10095 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10096 assert(start >= 0);
10097 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10098 FILL(kind, data, fill_char, start, length);
10099}
10100
Victor Stinner3fe55312012-01-04 00:33:50 +010010101Py_ssize_t
10102PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10103 Py_UCS4 fill_char)
10104{
10105 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010106
10107 if (!PyUnicode_Check(unicode)) {
10108 PyErr_BadInternalCall();
10109 return -1;
10110 }
10111 if (PyUnicode_READY(unicode) == -1)
10112 return -1;
10113 if (unicode_check_modifiable(unicode))
10114 return -1;
10115
Victor Stinnerd3f08822012-05-29 12:57:52 +020010116 if (start < 0) {
10117 PyErr_SetString(PyExc_IndexError, "string index out of range");
10118 return -1;
10119 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010120 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10121 PyErr_SetString(PyExc_ValueError,
10122 "fill character is bigger than "
10123 "the string maximum character");
10124 return -1;
10125 }
10126
10127 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10128 length = Py_MIN(maxlen, length);
10129 if (length <= 0)
10130 return 0;
10131
Victor Stinnerd3f08822012-05-29 12:57:52 +020010132 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010133 return length;
10134}
10135
Victor Stinner9310abb2011-10-05 00:59:23 +020010136static PyObject *
10137pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010138 Py_ssize_t left,
10139 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 PyObject *u;
10143 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010144 int kind;
10145 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146
10147 if (left < 0)
10148 left = 0;
10149 if (right < 0)
10150 right = 0;
10151
Victor Stinnerc4b49542011-12-11 22:44:26 +010010152 if (left == 0 && right == 0)
10153 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10156 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010157 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10158 return NULL;
10159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010161 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010163 if (!u)
10164 return NULL;
10165
10166 kind = PyUnicode_KIND(u);
10167 data = PyUnicode_DATA(u);
10168 if (left)
10169 FILL(kind, data, fill, 0, left);
10170 if (right)
10171 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010172 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010173 assert(_PyUnicode_CheckConsistency(u, 1));
10174 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175}
10176
Alexander Belopolsky40018472011-02-26 01:02:56 +000010177PyObject *
10178PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010182 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184
Benjamin Petersonead6b532011-12-20 17:23:42 -060010185 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010187 if (PyUnicode_IS_ASCII(string))
10188 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010189 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 PyUnicode_GET_LENGTH(string), keepends);
10191 else
10192 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010193 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010194 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 break;
10196 case PyUnicode_2BYTE_KIND:
10197 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010198 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 PyUnicode_GET_LENGTH(string), keepends);
10200 break;
10201 case PyUnicode_4BYTE_KIND:
10202 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010203 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 PyUnicode_GET_LENGTH(string), keepends);
10205 break;
10206 default:
10207 assert(0);
10208 list = 0;
10209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211}
10212
Alexander Belopolsky40018472011-02-26 01:02:56 +000010213static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010214split(PyObject *self,
10215 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010216 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010218 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 void *buf1, *buf2;
10220 Py_ssize_t len1, len2;
10221 PyObject* out;
10222
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010224 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 if (PyUnicode_READY(self) == -1)
10227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010230 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010232 if (PyUnicode_IS_ASCII(self))
10233 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010234 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010235 PyUnicode_GET_LENGTH(self), maxcount
10236 );
10237 else
10238 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010239 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010240 PyUnicode_GET_LENGTH(self), maxcount
10241 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 case PyUnicode_2BYTE_KIND:
10243 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010244 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 PyUnicode_GET_LENGTH(self), maxcount
10246 );
10247 case PyUnicode_4BYTE_KIND:
10248 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010249 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 PyUnicode_GET_LENGTH(self), maxcount
10251 );
10252 default:
10253 assert(0);
10254 return NULL;
10255 }
10256
10257 if (PyUnicode_READY(substring) == -1)
10258 return NULL;
10259
10260 kind1 = PyUnicode_KIND(self);
10261 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 len1 = PyUnicode_GET_LENGTH(self);
10263 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010264 if (kind1 < kind2 || len1 < len2) {
10265 out = PyList_New(1);
10266 if (out == NULL)
10267 return NULL;
10268 Py_INCREF(self);
10269 PyList_SET_ITEM(out, 0, self);
10270 return out;
10271 }
10272 buf1 = PyUnicode_DATA(self);
10273 buf2 = PyUnicode_DATA(substring);
10274 if (kind2 != kind1) {
10275 buf2 = _PyUnicode_AsKind(substring, kind1);
10276 if (!buf2)
10277 return NULL;
10278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010280 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010282 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10283 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010284 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010285 else
10286 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010287 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 break;
10289 case PyUnicode_2BYTE_KIND:
10290 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010291 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 break;
10293 case PyUnicode_4BYTE_KIND:
10294 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 break;
10297 default:
10298 out = NULL;
10299 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010300 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 PyMem_Free(buf2);
10302 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303}
10304
Alexander Belopolsky40018472011-02-26 01:02:56 +000010305static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010306rsplit(PyObject *self,
10307 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010308 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010309{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010310 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 void *buf1, *buf2;
10312 Py_ssize_t len1, len2;
10313 PyObject* out;
10314
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010315 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010316 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 if (PyUnicode_READY(self) == -1)
10319 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010322 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010324 if (PyUnicode_IS_ASCII(self))
10325 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010326 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010327 PyUnicode_GET_LENGTH(self), maxcount
10328 );
10329 else
10330 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 PyUnicode_GET_LENGTH(self), maxcount
10333 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 case PyUnicode_2BYTE_KIND:
10335 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 PyUnicode_GET_LENGTH(self), maxcount
10338 );
10339 case PyUnicode_4BYTE_KIND:
10340 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 PyUnicode_GET_LENGTH(self), maxcount
10343 );
10344 default:
10345 assert(0);
10346 return NULL;
10347 }
10348
10349 if (PyUnicode_READY(substring) == -1)
10350 return NULL;
10351
10352 kind1 = PyUnicode_KIND(self);
10353 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 len1 = PyUnicode_GET_LENGTH(self);
10355 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010356 if (kind1 < kind2 || len1 < len2) {
10357 out = PyList_New(1);
10358 if (out == NULL)
10359 return NULL;
10360 Py_INCREF(self);
10361 PyList_SET_ITEM(out, 0, self);
10362 return out;
10363 }
10364 buf1 = PyUnicode_DATA(self);
10365 buf2 = PyUnicode_DATA(substring);
10366 if (kind2 != kind1) {
10367 buf2 = _PyUnicode_AsKind(substring, kind1);
10368 if (!buf2)
10369 return NULL;
10370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010372 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010374 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10375 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010376 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010377 else
10378 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 break;
10381 case PyUnicode_2BYTE_KIND:
10382 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010383 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 break;
10385 case PyUnicode_4BYTE_KIND:
10386 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010387 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 break;
10389 default:
10390 out = NULL;
10391 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010392 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 PyMem_Free(buf2);
10394 return out;
10395}
10396
10397static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010398anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10399 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010401 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010403 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10404 return asciilib_find(buf1, len1, buf2, len2, offset);
10405 else
10406 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 case PyUnicode_2BYTE_KIND:
10408 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10409 case PyUnicode_4BYTE_KIND:
10410 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10411 }
10412 assert(0);
10413 return -1;
10414}
10415
10416static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010417anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10418 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010420 switch (kind) {
10421 case PyUnicode_1BYTE_KIND:
10422 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10423 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10424 else
10425 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10426 case PyUnicode_2BYTE_KIND:
10427 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10428 case PyUnicode_4BYTE_KIND:
10429 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10430 }
10431 assert(0);
10432 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010433}
10434
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010435static void
10436replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10437 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10438{
10439 int kind = PyUnicode_KIND(u);
10440 void *data = PyUnicode_DATA(u);
10441 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10442 if (kind == PyUnicode_1BYTE_KIND) {
10443 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10444 (Py_UCS1 *)data + len,
10445 u1, u2, maxcount);
10446 }
10447 else if (kind == PyUnicode_2BYTE_KIND) {
10448 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10449 (Py_UCS2 *)data + len,
10450 u1, u2, maxcount);
10451 }
10452 else {
10453 assert(kind == PyUnicode_4BYTE_KIND);
10454 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10455 (Py_UCS4 *)data + len,
10456 u1, u2, maxcount);
10457 }
10458}
10459
Alexander Belopolsky40018472011-02-26 01:02:56 +000010460static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461replace(PyObject *self, PyObject *str1,
10462 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 PyObject *u;
10465 char *sbuf = PyUnicode_DATA(self);
10466 char *buf1 = PyUnicode_DATA(str1);
10467 char *buf2 = PyUnicode_DATA(str2);
10468 int srelease = 0, release1 = 0, release2 = 0;
10469 int skind = PyUnicode_KIND(self);
10470 int kind1 = PyUnicode_KIND(str1);
10471 int kind2 = PyUnicode_KIND(str2);
10472 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10473 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10474 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010475 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010476 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477
10478 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010479 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010481 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482
Victor Stinner59de0ee2011-10-07 10:01:28 +020010483 if (str1 == str2)
10484 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485
Victor Stinner49a0a212011-10-12 23:46:10 +020010486 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010487 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10488 if (maxchar < maxchar_str1)
10489 /* substring too wide to be present */
10490 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010491 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10492 /* Replacing str1 with str2 may cause a maxchar reduction in the
10493 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010494 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010495 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010498 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010500 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010502 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010503 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010504 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010505
Victor Stinner69ed0f42013-04-09 21:48:24 +020010506 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010507 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010508 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010509 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010510 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010512 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010514
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010515 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10516 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 }
10518 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 int rkind = skind;
10520 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010521 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (kind1 < rkind) {
10524 /* widen substring */
10525 buf1 = _PyUnicode_AsKind(str1, rkind);
10526 if (!buf1) goto error;
10527 release1 = 1;
10528 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010529 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010530 if (i < 0)
10531 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 if (rkind > kind2) {
10533 /* widen replacement */
10534 buf2 = _PyUnicode_AsKind(str2, rkind);
10535 if (!buf2) goto error;
10536 release2 = 1;
10537 }
10538 else if (rkind < kind2) {
10539 /* widen self and buf1 */
10540 rkind = kind2;
10541 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010542 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 sbuf = _PyUnicode_AsKind(self, rkind);
10544 if (!sbuf) goto error;
10545 srelease = 1;
10546 buf1 = _PyUnicode_AsKind(str1, rkind);
10547 if (!buf1) goto error;
10548 release1 = 1;
10549 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010550 u = PyUnicode_New(slen, maxchar);
10551 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010553 assert(PyUnicode_KIND(u) == rkind);
10554 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010555
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010556 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010557 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010558 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010560 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010562
10563 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010564 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010565 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010566 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010567 if (i == -1)
10568 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010569 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010571 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010575 }
10576 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010578 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 int rkind = skind;
10580 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010583 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 buf1 = _PyUnicode_AsKind(str1, rkind);
10585 if (!buf1) goto error;
10586 release1 = 1;
10587 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010588 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010589 if (n == 0)
10590 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010592 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 buf2 = _PyUnicode_AsKind(str2, rkind);
10594 if (!buf2) goto error;
10595 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010598 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 rkind = kind2;
10600 sbuf = _PyUnicode_AsKind(self, rkind);
10601 if (!sbuf) goto error;
10602 srelease = 1;
10603 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010604 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 buf1 = _PyUnicode_AsKind(str1, rkind);
10606 if (!buf1) goto error;
10607 release1 = 1;
10608 }
10609 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10610 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010611 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 PyErr_SetString(PyExc_OverflowError,
10613 "replace string is too long");
10614 goto error;
10615 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010616 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010617 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010618 _Py_INCREF_UNICODE_EMPTY();
10619 if (!unicode_empty)
10620 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010621 u = unicode_empty;
10622 goto done;
10623 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010624 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 PyErr_SetString(PyExc_OverflowError,
10626 "replace string is too long");
10627 goto error;
10628 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010629 u = PyUnicode_New(new_size, maxchar);
10630 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010632 assert(PyUnicode_KIND(u) == rkind);
10633 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 ires = i = 0;
10635 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636 while (n-- > 0) {
10637 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010638 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010639 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010640 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010641 if (j == -1)
10642 break;
10643 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010645 memcpy(res + rkind * ires,
10646 sbuf + rkind * i,
10647 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 }
10650 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010652 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010654 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010660 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010661 memcpy(res + rkind * ires,
10662 sbuf + rkind * i,
10663 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010664 }
10665 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666 /* interleave */
10667 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010670 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 if (--n <= 0)
10673 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010674 memcpy(res + rkind * ires,
10675 sbuf + rkind * i,
10676 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 ires++;
10678 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010680 memcpy(res + rkind * ires,
10681 sbuf + rkind * i,
10682 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010684 }
10685
10686 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010687 unicode_adjust_maxchar(&u);
10688 if (u == NULL)
10689 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010691
10692 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 if (srelease)
10694 PyMem_FREE(sbuf);
10695 if (release1)
10696 PyMem_FREE(buf1);
10697 if (release2)
10698 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010699 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010701
Benjamin Peterson29060642009-01-31 22:14:21 +000010702 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010703 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 if (srelease)
10705 PyMem_FREE(sbuf);
10706 if (release1)
10707 PyMem_FREE(buf1);
10708 if (release2)
10709 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010710 return unicode_result_unchanged(self);
10711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 error:
10713 if (srelease && sbuf)
10714 PyMem_FREE(sbuf);
10715 if (release1 && buf1)
10716 PyMem_FREE(buf1);
10717 if (release2 && buf2)
10718 PyMem_FREE(buf2);
10719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720}
10721
10722/* --- Unicode Object Methods --------------------------------------------- */
10723
INADA Naoki3ae20562017-01-16 20:41:20 +090010724/*[clinic input]
10725str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726
INADA Naoki3ae20562017-01-16 20:41:20 +090010727Return a version of the string where each word is titlecased.
10728
10729More specifically, words start with uppercased characters and all remaining
10730cased characters have lower case.
10731[clinic start generated code]*/
10732
10733static PyObject *
10734unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010735/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010737 if (PyUnicode_READY(self) == -1)
10738 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010739 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740}
10741
INADA Naoki3ae20562017-01-16 20:41:20 +090010742/*[clinic input]
10743str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744
INADA Naoki3ae20562017-01-16 20:41:20 +090010745Return a capitalized version of the string.
10746
10747More specifically, make the first character have upper case and the rest lower
10748case.
10749[clinic start generated code]*/
10750
10751static PyObject *
10752unicode_capitalize_impl(PyObject *self)
10753/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010755 if (PyUnicode_READY(self) == -1)
10756 return NULL;
10757 if (PyUnicode_GET_LENGTH(self) == 0)
10758 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010759 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760}
10761
INADA Naoki3ae20562017-01-16 20:41:20 +090010762/*[clinic input]
10763str.casefold as unicode_casefold
10764
10765Return a version of the string suitable for caseless comparisons.
10766[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010767
10768static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010769unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010770/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010771{
10772 if (PyUnicode_READY(self) == -1)
10773 return NULL;
10774 if (PyUnicode_IS_ASCII(self))
10775 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010776 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010777}
10778
10779
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010780/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010781
10782static int
10783convert_uc(PyObject *obj, void *addr)
10784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010786
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010787 if (!PyUnicode_Check(obj)) {
10788 PyErr_Format(PyExc_TypeError,
10789 "The fill character must be a unicode character, "
10790 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010791 return 0;
10792 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010793 if (PyUnicode_READY(obj) < 0)
10794 return 0;
10795 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010796 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010798 return 0;
10799 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010800 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010801 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010802}
10803
INADA Naoki3ae20562017-01-16 20:41:20 +090010804/*[clinic input]
10805str.center as unicode_center
10806
10807 width: Py_ssize_t
10808 fillchar: Py_UCS4 = ' '
10809 /
10810
10811Return a centered string of length width.
10812
10813Padding is done using the specified fill character (default is a space).
10814[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815
10816static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010817unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10818/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010820 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
Benjamin Petersonbac79492012-01-14 13:34:47 -050010822 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823 return NULL;
10824
Victor Stinnerc4b49542011-12-11 22:44:26 +010010825 if (PyUnicode_GET_LENGTH(self) >= width)
10826 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827
Victor Stinnerc4b49542011-12-11 22:44:26 +010010828 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829 left = marg / 2 + (marg & width & 1);
10830
Victor Stinner9310abb2011-10-05 00:59:23 +020010831 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832}
10833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834/* This function assumes that str1 and str2 are readied by the caller. */
10835
Marc-André Lemburge5034372000-08-08 08:04:29 +000010836static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010837unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010838{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010839#define COMPARE(TYPE1, TYPE2) \
10840 do { \
10841 TYPE1* p1 = (TYPE1 *)data1; \
10842 TYPE2* p2 = (TYPE2 *)data2; \
10843 TYPE1* end = p1 + len; \
10844 Py_UCS4 c1, c2; \
10845 for (; p1 != end; p1++, p2++) { \
10846 c1 = *p1; \
10847 c2 = *p2; \
10848 if (c1 != c2) \
10849 return (c1 < c2) ? -1 : 1; \
10850 } \
10851 } \
10852 while (0)
10853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 int kind1, kind2;
10855 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010856 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 kind1 = PyUnicode_KIND(str1);
10859 kind2 = PyUnicode_KIND(str2);
10860 data1 = PyUnicode_DATA(str1);
10861 data2 = PyUnicode_DATA(str2);
10862 len1 = PyUnicode_GET_LENGTH(str1);
10863 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010864 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010865
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010866 switch(kind1) {
10867 case PyUnicode_1BYTE_KIND:
10868 {
10869 switch(kind2) {
10870 case PyUnicode_1BYTE_KIND:
10871 {
10872 int cmp = memcmp(data1, data2, len);
10873 /* normalize result of memcmp() into the range [-1; 1] */
10874 if (cmp < 0)
10875 return -1;
10876 if (cmp > 0)
10877 return 1;
10878 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010879 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010880 case PyUnicode_2BYTE_KIND:
10881 COMPARE(Py_UCS1, Py_UCS2);
10882 break;
10883 case PyUnicode_4BYTE_KIND:
10884 COMPARE(Py_UCS1, Py_UCS4);
10885 break;
10886 default:
10887 assert(0);
10888 }
10889 break;
10890 }
10891 case PyUnicode_2BYTE_KIND:
10892 {
10893 switch(kind2) {
10894 case PyUnicode_1BYTE_KIND:
10895 COMPARE(Py_UCS2, Py_UCS1);
10896 break;
10897 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010898 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010899 COMPARE(Py_UCS2, Py_UCS2);
10900 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010901 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010902 case PyUnicode_4BYTE_KIND:
10903 COMPARE(Py_UCS2, Py_UCS4);
10904 break;
10905 default:
10906 assert(0);
10907 }
10908 break;
10909 }
10910 case PyUnicode_4BYTE_KIND:
10911 {
10912 switch(kind2) {
10913 case PyUnicode_1BYTE_KIND:
10914 COMPARE(Py_UCS4, Py_UCS1);
10915 break;
10916 case PyUnicode_2BYTE_KIND:
10917 COMPARE(Py_UCS4, Py_UCS2);
10918 break;
10919 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010920 {
10921#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10922 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10923 /* normalize result of wmemcmp() into the range [-1; 1] */
10924 if (cmp < 0)
10925 return -1;
10926 if (cmp > 0)
10927 return 1;
10928#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010929 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010930#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010931 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010932 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010933 default:
10934 assert(0);
10935 }
10936 break;
10937 }
10938 default:
10939 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010940 }
10941
Victor Stinner770e19e2012-10-04 22:59:45 +020010942 if (len1 == len2)
10943 return 0;
10944 if (len1 < len2)
10945 return -1;
10946 else
10947 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010948
10949#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010950}
10951
Benjamin Peterson621b4302016-09-09 13:54:34 -070010952static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010953unicode_compare_eq(PyObject *str1, PyObject *str2)
10954{
10955 int kind;
10956 void *data1, *data2;
10957 Py_ssize_t len;
10958 int cmp;
10959
Victor Stinnere5567ad2012-10-23 02:48:49 +020010960 len = PyUnicode_GET_LENGTH(str1);
10961 if (PyUnicode_GET_LENGTH(str2) != len)
10962 return 0;
10963 kind = PyUnicode_KIND(str1);
10964 if (PyUnicode_KIND(str2) != kind)
10965 return 0;
10966 data1 = PyUnicode_DATA(str1);
10967 data2 = PyUnicode_DATA(str2);
10968
10969 cmp = memcmp(data1, data2, len * kind);
10970 return (cmp == 0);
10971}
10972
10973
Alexander Belopolsky40018472011-02-26 01:02:56 +000010974int
10975PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10978 if (PyUnicode_READY(left) == -1 ||
10979 PyUnicode_READY(right) == -1)
10980 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010981
10982 /* a string is equal to itself */
10983 if (left == right)
10984 return 0;
10985
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010986 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010988 PyErr_Format(PyExc_TypeError,
10989 "Can't compare %.100s and %.100s",
10990 left->ob_type->tp_name,
10991 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 return -1;
10993}
10994
Martin v. Löwis5b222132007-06-10 09:51:05 +000010995int
10996PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 Py_ssize_t i;
10999 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011001 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002
Victor Stinner910337b2011-10-03 03:20:16 +020011003 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011004 if (!PyUnicode_IS_READY(uni)) {
11005 const wchar_t *ws = _PyUnicode_WSTR(uni);
11006 /* Compare Unicode string and source character set string */
11007 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11008 if (chr != ustr[i])
11009 return (chr < ustr[i]) ? -1 : 1;
11010 }
11011 /* This check keeps Python strings that end in '\0' from comparing equal
11012 to C strings identical up to that point. */
11013 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11014 return 1; /* uni is longer */
11015 if (ustr[i])
11016 return -1; /* str is longer */
11017 return 0;
11018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011020 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011021 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011022 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011023 size_t len, len2 = strlen(str);
11024 int cmp;
11025
11026 len = Py_MIN(len1, len2);
11027 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011028 if (cmp != 0) {
11029 if (cmp < 0)
11030 return -1;
11031 else
11032 return 1;
11033 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011034 if (len1 > len2)
11035 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011036 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011037 return -1; /* str is longer */
11038 return 0;
11039 }
11040 else {
11041 void *data = PyUnicode_DATA(uni);
11042 /* Compare Unicode string and source character set string */
11043 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011044 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011045 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11046 /* This check keeps Python strings that end in '\0' from comparing equal
11047 to C strings identical up to that point. */
11048 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11049 return 1; /* uni is longer */
11050 if (str[i])
11051 return -1; /* str is longer */
11052 return 0;
11053 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011054}
11055
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011056static int
11057non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11058{
11059 size_t i, len;
11060 const wchar_t *p;
11061 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11062 if (strlen(str) != len)
11063 return 0;
11064 p = _PyUnicode_WSTR(unicode);
11065 assert(p);
11066 for (i = 0; i < len; i++) {
11067 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011068 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011069 return 0;
11070 }
11071 return 1;
11072}
11073
11074int
11075_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11076{
11077 size_t len;
11078 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011079 assert(str);
11080#ifndef NDEBUG
11081 for (const char *p = str; *p; p++) {
11082 assert((unsigned char)*p < 128);
11083 }
11084#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011085 if (PyUnicode_READY(unicode) == -1) {
11086 /* Memory error or bad data */
11087 PyErr_Clear();
11088 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11089 }
11090 if (!PyUnicode_IS_ASCII(unicode))
11091 return 0;
11092 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11093 return strlen(str) == len &&
11094 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11095}
11096
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011097int
11098_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11099{
11100 PyObject *right_uni;
11101 Py_hash_t hash;
11102
11103 assert(_PyUnicode_CHECK(left));
11104 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011105#ifndef NDEBUG
11106 for (const char *p = right->string; *p; p++) {
11107 assert((unsigned char)*p < 128);
11108 }
11109#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011110
11111 if (PyUnicode_READY(left) == -1) {
11112 /* memory error or bad data */
11113 PyErr_Clear();
11114 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11115 }
11116
11117 if (!PyUnicode_IS_ASCII(left))
11118 return 0;
11119
11120 right_uni = _PyUnicode_FromId(right); /* borrowed */
11121 if (right_uni == NULL) {
11122 /* memory error or bad data */
11123 PyErr_Clear();
11124 return _PyUnicode_EqualToASCIIString(left, right->string);
11125 }
11126
11127 if (left == right_uni)
11128 return 1;
11129
11130 if (PyUnicode_CHECK_INTERNED(left))
11131 return 0;
11132
11133 assert(_PyUnicode_HASH(right_uni) != 1);
11134 hash = _PyUnicode_HASH(left);
11135 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11136 return 0;
11137
11138 return unicode_compare_eq(left, right_uni);
11139}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011140
/* Map a C truth value onto Py_True / Py_False.  The macro only selects the
   object; it does not change any reference count. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011143
Alexander Belopolsky40018472011-02-26 01:02:56 +000011144PyObject *
11145PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011146{
11147 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011148 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011149
Victor Stinnere5567ad2012-10-23 02:48:49 +020011150 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11151 Py_RETURN_NOTIMPLEMENTED;
11152
11153 if (PyUnicode_READY(left) == -1 ||
11154 PyUnicode_READY(right) == -1)
11155 return NULL;
11156
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011157 if (left == right) {
11158 switch (op) {
11159 case Py_EQ:
11160 case Py_LE:
11161 case Py_GE:
11162 /* a string is equal to itself */
11163 v = Py_True;
11164 break;
11165 case Py_NE:
11166 case Py_LT:
11167 case Py_GT:
11168 v = Py_False;
11169 break;
11170 default:
11171 PyErr_BadArgument();
11172 return NULL;
11173 }
11174 }
11175 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011176 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011177 result ^= (op == Py_NE);
11178 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011179 }
11180 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011181 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011182
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011183 /* Convert the return value to a Boolean */
11184 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011185 case Py_LE:
11186 v = TEST_COND(result <= 0);
11187 break;
11188 case Py_GE:
11189 v = TEST_COND(result >= 0);
11190 break;
11191 case Py_LT:
11192 v = TEST_COND(result == -1);
11193 break;
11194 case Py_GT:
11195 v = TEST_COND(result == 1);
11196 break;
11197 default:
11198 PyErr_BadArgument();
11199 return NULL;
11200 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011201 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011202 Py_INCREF(v);
11203 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011204}
11205
Alexander Belopolsky40018472011-02-26 01:02:56 +000011206int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011207_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11208{
11209 return unicode_eq(aa, bb);
11210}
11211
11212int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011213PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011214{
Victor Stinner77282cb2013-04-14 19:22:47 +020011215 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 void *buf1, *buf2;
11217 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011218 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011219
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011220 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011222 "'in <string>' requires string as left operand, not %.100s",
11223 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011224 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011225 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011226 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011227 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011228 if (ensure_unicode(str) < 0)
11229 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011232 kind2 = PyUnicode_KIND(substr);
11233 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011234 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 len2 = PyUnicode_GET_LENGTH(substr);
11237 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011238 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011239 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011241 if (len2 == 1) {
11242 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11243 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011244 return result;
11245 }
11246 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011247 buf2 = _PyUnicode_AsKind(substr, kind1);
11248 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011249 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251
Victor Stinner77282cb2013-04-14 19:22:47 +020011252 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 case PyUnicode_1BYTE_KIND:
11254 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11255 break;
11256 case PyUnicode_2BYTE_KIND:
11257 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11258 break;
11259 case PyUnicode_4BYTE_KIND:
11260 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11261 break;
11262 default:
11263 result = -1;
11264 assert(0);
11265 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011266
Victor Stinner77282cb2013-04-14 19:22:47 +020011267 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 PyMem_Free(buf2);
11269
Guido van Rossum403d68b2000-03-13 15:55:09 +000011270 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011271}
11272
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273/* Concat to string or Unicode object giving a new Unicode object. */
11274
Alexander Belopolsky40018472011-02-26 01:02:56 +000011275PyObject *
11276PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011278 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011279 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011280 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011282 if (ensure_unicode(left) < 0)
11283 return NULL;
11284
11285 if (!PyUnicode_Check(right)) {
11286 PyErr_Format(PyExc_TypeError,
11287 "can only concatenate str (not \"%.200s\") to str",
11288 right->ob_type->tp_name);
11289 return NULL;
11290 }
11291 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293
11294 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295 if (left == unicode_empty)
11296 return PyUnicode_FromObject(right);
11297 if (right == unicode_empty)
11298 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011300 left_len = PyUnicode_GET_LENGTH(left);
11301 right_len = PyUnicode_GET_LENGTH(right);
11302 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011303 PyErr_SetString(PyExc_OverflowError,
11304 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011305 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011306 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011307 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011308
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011309 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11310 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011311 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011314 result = PyUnicode_New(new_len, maxchar);
11315 if (result == NULL)
11316 return NULL;
11317 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11318 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11319 assert(_PyUnicode_CheckConsistency(result, 1));
11320 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321}
11322
Walter Dörwald1ab83302007-05-18 17:15:44 +000011323void
Victor Stinner23e56682011-10-03 03:54:37 +020011324PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011325{
Victor Stinner23e56682011-10-03 03:54:37 +020011326 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011327 Py_UCS4 maxchar, maxchar2;
11328 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011329
11330 if (p_left == NULL) {
11331 if (!PyErr_Occurred())
11332 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011333 return;
11334 }
Victor Stinner23e56682011-10-03 03:54:37 +020011335 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011336 if (right == NULL || left == NULL
11337 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011338 if (!PyErr_Occurred())
11339 PyErr_BadInternalCall();
11340 goto error;
11341 }
11342
Benjamin Petersonbac79492012-01-14 13:34:47 -050011343 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011344 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011345 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011346 goto error;
11347
Victor Stinner488fa492011-12-12 00:01:39 +010011348 /* Shortcuts */
11349 if (left == unicode_empty) {
11350 Py_DECREF(left);
11351 Py_INCREF(right);
11352 *p_left = right;
11353 return;
11354 }
11355 if (right == unicode_empty)
11356 return;
11357
11358 left_len = PyUnicode_GET_LENGTH(left);
11359 right_len = PyUnicode_GET_LENGTH(right);
11360 if (left_len > PY_SSIZE_T_MAX - right_len) {
11361 PyErr_SetString(PyExc_OverflowError,
11362 "strings are too large to concat");
11363 goto error;
11364 }
11365 new_len = left_len + right_len;
11366
11367 if (unicode_modifiable(left)
11368 && PyUnicode_CheckExact(right)
11369 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011370 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11371 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011372 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011373 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011374 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11375 {
11376 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011377 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011378 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011379
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011380 /* copy 'right' into the newly allocated area of 'left' */
11381 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011382 }
Victor Stinner488fa492011-12-12 00:01:39 +010011383 else {
11384 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11385 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011386 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011387
Victor Stinner488fa492011-12-12 00:01:39 +010011388 /* Concat the two Unicode strings */
11389 res = PyUnicode_New(new_len, maxchar);
11390 if (res == NULL)
11391 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011392 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11393 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011394 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011395 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011396 }
11397 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011398 return;
11399
11400error:
Victor Stinner488fa492011-12-12 00:01:39 +010011401 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011402}
11403
11404void
11405PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11406{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011407 PyUnicode_Append(pleft, right);
11408 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011409}
11410
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011411/*
11412Wraps stringlib_parse_args_finds() and additionally ensures that the
11413first argument is a unicode object.
11414*/
11415
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011416static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011417parse_args_finds_unicode(const char * function_name, PyObject *args,
11418 PyObject **substring,
11419 Py_ssize_t *start, Py_ssize_t *end)
11420{
11421 if(stringlib_parse_args_finds(function_name, args, substring,
11422 start, end)) {
11423 if (ensure_unicode(*substring) < 0)
11424 return 0;
11425 return 1;
11426 }
11427 return 0;
11428}
11429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011430PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011433Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011434string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011435interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
11437static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011438unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011440 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011441 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011442 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011444 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 void *buf1, *buf2;
11446 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011448 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 kind1 = PyUnicode_KIND(self);
11452 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011453 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011454 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 len1 = PyUnicode_GET_LENGTH(self);
11457 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011459 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011460 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011461
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011462 buf1 = PyUnicode_DATA(self);
11463 buf2 = PyUnicode_DATA(substring);
11464 if (kind2 != kind1) {
11465 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011466 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011467 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011468 }
11469 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 case PyUnicode_1BYTE_KIND:
11471 iresult = ucs1lib_count(
11472 ((Py_UCS1*)buf1) + start, end - start,
11473 buf2, len2, PY_SSIZE_T_MAX
11474 );
11475 break;
11476 case PyUnicode_2BYTE_KIND:
11477 iresult = ucs2lib_count(
11478 ((Py_UCS2*)buf1) + start, end - start,
11479 buf2, len2, PY_SSIZE_T_MAX
11480 );
11481 break;
11482 case PyUnicode_4BYTE_KIND:
11483 iresult = ucs4lib_count(
11484 ((Py_UCS4*)buf1) + start, end - start,
11485 buf2, len2, PY_SSIZE_T_MAX
11486 );
11487 break;
11488 default:
11489 assert(0); iresult = 0;
11490 }
11491
11492 result = PyLong_FromSsize_t(iresult);
11493
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011494 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 return result;
11498}
11499
INADA Naoki3ae20562017-01-16 20:41:20 +090011500/*[clinic input]
11501str.encode as unicode_encode
11502
11503 encoding: str(c_default="NULL") = 'utf-8'
11504 The encoding in which to encode the string.
11505 errors: str(c_default="NULL") = 'strict'
11506 The error handling scheme to use for encoding errors.
11507 The default is 'strict' meaning that encoding errors raise a
11508 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11509 'xmlcharrefreplace' as well as any other name registered with
11510 codecs.register_error that can handle UnicodeEncodeErrors.
11511
11512Encode the string using the codec registered for encoding.
11513[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
11515static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011516unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011517/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011519 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011520}
11521
INADA Naoki3ae20562017-01-16 20:41:20 +090011522/*[clinic input]
11523str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
INADA Naoki3ae20562017-01-16 20:41:20 +090011525 tabsize: int = 8
11526
11527Return a copy where all tab characters are expanded using spaces.
11528
11529If tabsize is not given, a tab size of 8 characters is assumed.
11530[clinic start generated code]*/
11531
11532static PyObject *
11533unicode_expandtabs_impl(PyObject *self, int tabsize)
11534/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011536 Py_ssize_t i, j, line_pos, src_len, incr;
11537 Py_UCS4 ch;
11538 PyObject *u;
11539 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011540 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011541 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542
Antoine Pitrou22425222011-10-04 19:10:51 +020011543 if (PyUnicode_READY(self) == -1)
11544 return NULL;
11545
Thomas Wouters7e474022000-07-16 12:04:32 +000011546 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011547 src_len = PyUnicode_GET_LENGTH(self);
11548 i = j = line_pos = 0;
11549 kind = PyUnicode_KIND(self);
11550 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011551 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011552 for (; i < src_len; i++) {
11553 ch = PyUnicode_READ(kind, src_data, i);
11554 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011555 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011557 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011559 goto overflow;
11560 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011562 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011566 goto overflow;
11567 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011569 if (ch == '\n' || ch == '\r')
11570 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011572 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011573 if (!found)
11574 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011575
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011577 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578 if (!u)
11579 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011580 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
Antoine Pitroue71d5742011-10-04 15:55:09 +020011582 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583
Antoine Pitroue71d5742011-10-04 15:55:09 +020011584 for (; i < src_len; i++) {
11585 ch = PyUnicode_READ(kind, src_data, i);
11586 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011588 incr = tabsize - (line_pos % tabsize);
11589 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011590 FILL(kind, dest_data, ' ', j, incr);
11591 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011592 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011593 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011595 line_pos++;
11596 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011597 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011598 if (ch == '\n' || ch == '\r')
11599 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011601 }
11602 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011603 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011604
Antoine Pitroue71d5742011-10-04 15:55:09 +020011605 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011606 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11607 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608}
11609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011610PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011611 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612\n\
11613Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011614such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615arguments start and end are interpreted as in slice notation.\n\
11616\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
11619static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011622 /* initialize variables to prevent gcc warning */
11623 PyObject *substring = NULL;
11624 Py_ssize_t start = 0;
11625 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011626 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011628 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011631 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011634 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 if (result == -2)
11637 return NULL;
11638
Christian Heimes217cfd12007-12-02 14:31:20 +000011639 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640}
11641
11642static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011643unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011645 void *data;
11646 enum PyUnicode_Kind kind;
11647 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011648
11649 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11650 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011652 }
11653 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11654 PyErr_SetString(PyExc_IndexError, "string index out of range");
11655 return NULL;
11656 }
11657 kind = PyUnicode_KIND(self);
11658 data = PyUnicode_DATA(self);
11659 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011660 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661}
11662
Guido van Rossumc2504932007-09-18 19:42:40 +000011663/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011664 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011665static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667{
Guido van Rossumc2504932007-09-18 19:42:40 +000011668 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011669 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011670
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011671#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011672 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011673#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 if (_PyUnicode_HASH(self) != -1)
11675 return _PyUnicode_HASH(self);
11676 if (PyUnicode_READY(self) == -1)
11677 return -1;
11678 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011679 /*
11680 We make the hash of the empty string be 0, rather than using
11681 (prefix ^ suffix), since this slightly obfuscates the hash secret
11682 */
11683 if (len == 0) {
11684 _PyUnicode_HASH(self) = 0;
11685 return 0;
11686 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011687 x = _Py_HashBytes(PyUnicode_DATA(self),
11688 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011690 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691}
11692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011693PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011694 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011696Return the lowest index in S where substring sub is found, \n\
11697such that sub is contained within S[start:end]. Optional\n\
11698arguments start and end are interpreted as in slice notation.\n\
11699\n\
11700Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701
11702static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011705 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011706 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011707 PyObject *substring = NULL;
11708 Py_ssize_t start = 0;
11709 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011711 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011714 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011717 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 if (result == -2)
11720 return NULL;
11721
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722 if (result < 0) {
11723 PyErr_SetString(PyExc_ValueError, "substring not found");
11724 return NULL;
11725 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011726
Christian Heimes217cfd12007-12-02 14:31:20 +000011727 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728}
11729
INADA Naoki3ae20562017-01-16 20:41:20 +090011730/*[clinic input]
11731str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
INADA Naoki3ae20562017-01-16 20:41:20 +090011733Return True if the string is a lowercase string, False otherwise.
11734
11735A string is lowercase if all cased characters in the string are lowercase and
11736there is at least one cased character in the string.
11737[clinic start generated code]*/
11738
11739static PyObject *
11740unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011741/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 Py_ssize_t i, length;
11744 int kind;
11745 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 int cased;
11747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 if (PyUnicode_READY(self) == -1)
11749 return NULL;
11750 length = PyUnicode_GET_LENGTH(self);
11751 kind = PyUnicode_KIND(self);
11752 data = PyUnicode_DATA(self);
11753
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 if (length == 1)
11756 return PyBool_FromLong(
11757 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011759 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011761 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011762
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 for (i = 0; i < length; i++) {
11765 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011766
Benjamin Peterson29060642009-01-31 22:14:21 +000011767 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011768 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 else if (!cased && Py_UNICODE_ISLOWER(ch))
11770 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011772 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773}
11774
INADA Naoki3ae20562017-01-16 20:41:20 +090011775/*[clinic input]
11776str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
INADA Naoki3ae20562017-01-16 20:41:20 +090011778Return True if the string is an uppercase string, False otherwise.
11779
11780A string is uppercase if all cased characters in the string are uppercase and
11781there is at least one cased character in the string.
11782[clinic start generated code]*/
11783
11784static PyObject *
11785unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011786/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 Py_ssize_t i, length;
11789 int kind;
11790 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 int cased;
11792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 if (PyUnicode_READY(self) == -1)
11794 return NULL;
11795 length = PyUnicode_GET_LENGTH(self);
11796 kind = PyUnicode_KIND(self);
11797 data = PyUnicode_DATA(self);
11798
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (length == 1)
11801 return PyBool_FromLong(
11802 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011804 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011806 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011807
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 for (i = 0; i < length; i++) {
11810 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011811
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011813 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 else if (!cased && Py_UNICODE_ISUPPER(ch))
11815 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011817 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818}
11819
INADA Naoki3ae20562017-01-16 20:41:20 +090011820/*[clinic input]
11821str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822
INADA Naoki3ae20562017-01-16 20:41:20 +090011823Return True if the string is a title-cased string, False otherwise.
11824
11825In a title-cased string, upper- and title-case characters may only
11826follow uncased characters and lowercase characters only cased ones.
11827[clinic start generated code]*/
11828
11829static PyObject *
11830unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011831/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 Py_ssize_t i, length;
11834 int kind;
11835 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 int cased, previous_is_cased;
11837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 if (PyUnicode_READY(self) == -1)
11839 return NULL;
11840 length = PyUnicode_GET_LENGTH(self);
11841 kind = PyUnicode_KIND(self);
11842 data = PyUnicode_DATA(self);
11843
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 if (length == 1) {
11846 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11847 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11848 (Py_UNICODE_ISUPPER(ch) != 0));
11849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011851 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011853 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011854
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855 cased = 0;
11856 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 for (i = 0; i < length; i++) {
11858 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011859
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11861 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011862 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 previous_is_cased = 1;
11864 cased = 1;
11865 }
11866 else if (Py_UNICODE_ISLOWER(ch)) {
11867 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011868 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011869 previous_is_cased = 1;
11870 cased = 1;
11871 }
11872 else
11873 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011875 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876}
11877
INADA Naoki3ae20562017-01-16 20:41:20 +090011878/*[clinic input]
11879str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880
INADA Naoki3ae20562017-01-16 20:41:20 +090011881Return True if the string is a whitespace string, False otherwise.
11882
11883A string is whitespace if all characters in the string are whitespace and there
11884is at least one character in the string.
11885[clinic start generated code]*/
11886
11887static PyObject *
11888unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011889/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 Py_ssize_t i, length;
11892 int kind;
11893 void *data;
11894
11895 if (PyUnicode_READY(self) == -1)
11896 return NULL;
11897 length = PyUnicode_GET_LENGTH(self);
11898 kind = PyUnicode_KIND(self);
11899 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (length == 1)
11903 return PyBool_FromLong(
11904 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011906 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011908 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 for (i = 0; i < length; i++) {
11911 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011912 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011913 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011915 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916}
11917
INADA Naoki3ae20562017-01-16 20:41:20 +090011918/*[clinic input]
11919str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011920
INADA Naoki3ae20562017-01-16 20:41:20 +090011921Return True if the string is an alphabetic string, False otherwise.
11922
11923A string is alphabetic if all characters in the string are alphabetic and there
11924is at least one character in the string.
11925[clinic start generated code]*/
11926
11927static PyObject *
11928unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011929/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011930{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 Py_ssize_t i, length;
11932 int kind;
11933 void *data;
11934
11935 if (PyUnicode_READY(self) == -1)
11936 return NULL;
11937 length = PyUnicode_GET_LENGTH(self);
11938 kind = PyUnicode_KIND(self);
11939 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011940
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 if (length == 1)
11943 return PyBool_FromLong(
11944 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011945
11946 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011948 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 for (i = 0; i < length; i++) {
11951 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011952 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011953 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011954 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011955}
11956
INADA Naoki3ae20562017-01-16 20:41:20 +090011957/*[clinic input]
11958str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011959
INADA Naoki3ae20562017-01-16 20:41:20 +090011960Return True if the string is an alpha-numeric string, False otherwise.
11961
11962A string is alpha-numeric if all characters in the string are alpha-numeric and
11963there is at least one character in the string.
11964[clinic start generated code]*/
11965
11966static PyObject *
11967unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011968/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 int kind;
11971 void *data;
11972 Py_ssize_t len, i;
11973
11974 if (PyUnicode_READY(self) == -1)
11975 return NULL;
11976
11977 kind = PyUnicode_KIND(self);
11978 data = PyUnicode_DATA(self);
11979 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011980
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011981 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 if (len == 1) {
11983 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11984 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11985 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011986
11987 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011989 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 for (i = 0; i < len; i++) {
11992 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011993 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011994 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011995 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011996 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011997}
11998
INADA Naoki3ae20562017-01-16 20:41:20 +090011999/*[clinic input]
12000str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
INADA Naoki3ae20562017-01-16 20:41:20 +090012002Return True if the string is a decimal string, False otherwise.
12003
12004A string is a decimal string if all characters in the string are decimal and
12005there is at least one character in the string.
12006[clinic start generated code]*/
12007
12008static PyObject *
12009unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012010/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 Py_ssize_t i, length;
12013 int kind;
12014 void *data;
12015
12016 if (PyUnicode_READY(self) == -1)
12017 return NULL;
12018 length = PyUnicode_GET_LENGTH(self);
12019 kind = PyUnicode_KIND(self);
12020 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 if (length == 1)
12024 return PyBool_FromLong(
12025 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012027 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012029 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 for (i = 0; i < length; i++) {
12032 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012033 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012035 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036}
12037
INADA Naoki3ae20562017-01-16 20:41:20 +090012038/*[clinic input]
12039str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040
INADA Naoki3ae20562017-01-16 20:41:20 +090012041Return True if the string is a digit string, False otherwise.
12042
12043A string is a digit string if all characters in the string are digits and there
12044is at least one character in the string.
12045[clinic start generated code]*/
12046
12047static PyObject *
12048unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012049/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 Py_ssize_t i, length;
12052 int kind;
12053 void *data;
12054
12055 if (PyUnicode_READY(self) == -1)
12056 return NULL;
12057 length = PyUnicode_GET_LENGTH(self);
12058 kind = PyUnicode_KIND(self);
12059 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 if (length == 1) {
12063 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12064 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012067 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012069 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 for (i = 0; i < length; i++) {
12072 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012073 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012075 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076}
12077
INADA Naoki3ae20562017-01-16 20:41:20 +090012078/*[clinic input]
12079str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
INADA Naoki3ae20562017-01-16 20:41:20 +090012081Return True if the string is a numeric string, False otherwise.
12082
12083A string is numeric if all characters in the string are numeric and there is at
12084least one character in the string.
12085[clinic start generated code]*/
12086
12087static PyObject *
12088unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012089/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 Py_ssize_t i, length;
12092 int kind;
12093 void *data;
12094
12095 if (PyUnicode_READY(self) == -1)
12096 return NULL;
12097 length = PyUnicode_GET_LENGTH(self);
12098 kind = PyUnicode_KIND(self);
12099 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 if (length == 1)
12103 return PyBool_FromLong(
12104 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012106 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012108 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 for (i = 0; i < length; i++) {
12111 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012112 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012114 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115}
12116
Martin v. Löwis47383402007-08-15 07:32:56 +000012117int
12118PyUnicode_IsIdentifier(PyObject *self)
12119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 int kind;
12121 void *data;
12122 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012123 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 if (PyUnicode_READY(self) == -1) {
12126 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 }
12129
12130 /* Special case for empty strings */
12131 if (PyUnicode_GET_LENGTH(self) == 0)
12132 return 0;
12133 kind = PyUnicode_KIND(self);
12134 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012135
12136 /* PEP 3131 says that the first character must be in
12137 XID_Start and subsequent characters in XID_Continue,
12138 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012139 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012140 letters, digits, underscore). However, given the current
12141 definition of XID_Start and XID_Continue, it is sufficient
12142 to check just for these, except that _ must be allowed
12143 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012145 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012146 return 0;
12147
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012148 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012150 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012151 return 1;
12152}
12153
INADA Naoki3ae20562017-01-16 20:41:20 +090012154/*[clinic input]
12155str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012156
INADA Naoki3ae20562017-01-16 20:41:20 +090012157Return True if the string is a valid Python identifier, False otherwise.
12158
12159Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12160"class".
12161[clinic start generated code]*/
12162
12163static PyObject *
12164unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012165/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012166{
12167 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12168}
12169
INADA Naoki3ae20562017-01-16 20:41:20 +090012170/*[clinic input]
12171str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012172
INADA Naoki3ae20562017-01-16 20:41:20 +090012173Return True if the string is printable, False otherwise.
12174
12175A string is printable if all of its characters are considered printable in
12176repr() or if it is empty.
12177[clinic start generated code]*/
12178
12179static PyObject *
12180unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012181/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 Py_ssize_t i, length;
12184 int kind;
12185 void *data;
12186
12187 if (PyUnicode_READY(self) == -1)
12188 return NULL;
12189 length = PyUnicode_GET_LENGTH(self);
12190 kind = PyUnicode_KIND(self);
12191 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012192
12193 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 if (length == 1)
12195 return PyBool_FromLong(
12196 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 for (i = 0; i < length; i++) {
12199 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012200 Py_RETURN_FALSE;
12201 }
12202 }
12203 Py_RETURN_TRUE;
12204}
12205
INADA Naoki3ae20562017-01-16 20:41:20 +090012206/*[clinic input]
12207str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
INADA Naoki3ae20562017-01-16 20:41:20 +090012209 iterable: object
12210 /
12211
12212Concatenate any number of strings.
12213
Martin Panter91a88662017-01-24 00:30:06 +000012214The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012215The result is returned as a new string.
12216
12217Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12218[clinic start generated code]*/
12219
12220static PyObject *
12221unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012222/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223{
INADA Naoki3ae20562017-01-16 20:41:20 +090012224 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225}
12226
Martin v. Löwis18e16552006-02-15 17:27:45 +000012227static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012228unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 if (PyUnicode_READY(self) == -1)
12231 return -1;
12232 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233}
12234
INADA Naoki3ae20562017-01-16 20:41:20 +090012235/*[clinic input]
12236str.ljust as unicode_ljust
12237
12238 width: Py_ssize_t
12239 fillchar: Py_UCS4 = ' '
12240 /
12241
12242Return a left-justified string of length width.
12243
12244Padding is done using the specified fill character (default is a space).
12245[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246
12247static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012248unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12249/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012251 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253
Victor Stinnerc4b49542011-12-11 22:44:26 +010012254 if (PyUnicode_GET_LENGTH(self) >= width)
12255 return unicode_result_unchanged(self);
12256
12257 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258}
12259
INADA Naoki3ae20562017-01-16 20:41:20 +090012260/*[clinic input]
12261str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262
INADA Naoki3ae20562017-01-16 20:41:20 +090012263Return a copy of the string converted to lowercase.
12264[clinic start generated code]*/
12265
12266static PyObject *
12267unicode_lower_impl(PyObject *self)
12268/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012270 if (PyUnicode_READY(self) == -1)
12271 return NULL;
12272 if (PyUnicode_IS_ASCII(self))
12273 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012274 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275}
12276
/* Strip modes; they double as indices into stripfuncnames below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   pointer slots are const, so the table cannot be reassigned at runtime. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method for strip mode i (used in error messages). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012285
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012286/* externally visible for str.strip(unicode) */
12287PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012288_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 void *data;
12291 int kind;
12292 Py_ssize_t i, j, len;
12293 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012294 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12297 return NULL;
12298
12299 kind = PyUnicode_KIND(self);
12300 data = PyUnicode_DATA(self);
12301 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012302 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12304 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012305 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306
Benjamin Peterson14339b62009-01-31 16:36:08 +000012307 i = 0;
12308 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012309 while (i < len) {
12310 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12311 if (!BLOOM(sepmask, ch))
12312 break;
12313 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12314 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 i++;
12316 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012317 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012318
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 j = len;
12320 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012321 j--;
12322 while (j >= i) {
12323 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12324 if (!BLOOM(sepmask, ch))
12325 break;
12326 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12327 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012329 }
12330
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012332 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012333
Victor Stinner7931d9a2011-11-04 00:22:48 +010012334 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335}
12336
12337PyObject*
12338PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12339{
12340 unsigned char *data;
12341 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012342 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343
Victor Stinnerde636f32011-10-01 03:55:54 +020012344 if (PyUnicode_READY(self) == -1)
12345 return NULL;
12346
Victor Stinner684d5fd2012-05-03 02:32:34 +020012347 length = PyUnicode_GET_LENGTH(self);
12348 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012349
Victor Stinner684d5fd2012-05-03 02:32:34 +020012350 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012351 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352
Victor Stinnerde636f32011-10-01 03:55:54 +020012353 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012354 PyErr_SetString(PyExc_IndexError, "string index out of range");
12355 return NULL;
12356 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012357 if (start >= length || end < start)
12358 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012359
Victor Stinner684d5fd2012-05-03 02:32:34 +020012360 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012361 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012362 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012363 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012364 }
12365 else {
12366 kind = PyUnicode_KIND(self);
12367 data = PyUnicode_1BYTE_DATA(self);
12368 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012369 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012370 length);
12371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373
12374static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012375do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 Py_ssize_t len, i, j;
12378
12379 if (PyUnicode_READY(self) == -1)
12380 return NULL;
12381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012383
Victor Stinnercc7af722013-04-09 22:39:24 +020012384 if (PyUnicode_IS_ASCII(self)) {
12385 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12386
12387 i = 0;
12388 if (striptype != RIGHTSTRIP) {
12389 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012390 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012391 if (!_Py_ascii_whitespace[ch])
12392 break;
12393 i++;
12394 }
12395 }
12396
12397 j = len;
12398 if (striptype != LEFTSTRIP) {
12399 j--;
12400 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012401 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012402 if (!_Py_ascii_whitespace[ch])
12403 break;
12404 j--;
12405 }
12406 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012407 }
12408 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012409 else {
12410 int kind = PyUnicode_KIND(self);
12411 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012412
Victor Stinnercc7af722013-04-09 22:39:24 +020012413 i = 0;
12414 if (striptype != RIGHTSTRIP) {
12415 while (i < len) {
12416 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12417 if (!Py_UNICODE_ISSPACE(ch))
12418 break;
12419 i++;
12420 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012421 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012422
12423 j = len;
12424 if (striptype != LEFTSTRIP) {
12425 j--;
12426 while (j >= i) {
12427 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12428 if (!Py_UNICODE_ISSPACE(ch))
12429 break;
12430 j--;
12431 }
12432 j++;
12433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012434 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012435
Victor Stinner7931d9a2011-11-04 00:22:48 +010012436 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437}
12438
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012439
12440static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012441do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012442{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012443 if (sep != NULL && sep != Py_None) {
12444 if (PyUnicode_Check(sep))
12445 return _PyUnicode_XStrip(self, striptype, sep);
12446 else {
12447 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012448 "%s arg must be None or str",
12449 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012450 return NULL;
12451 }
12452 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012453
Benjamin Peterson14339b62009-01-31 16:36:08 +000012454 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012455}
12456
12457
INADA Naoki3ae20562017-01-16 20:41:20 +090012458/*[clinic input]
12459str.strip as unicode_strip
12460
12461 chars: object = None
12462 /
12463
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012465
12466If chars is given and not None, remove characters in chars instead.
12467[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012468
12469static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012470unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012471/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012472{
INADA Naoki3ae20562017-01-16 20:41:20 +090012473 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012474}
12475
12476
INADA Naoki3ae20562017-01-16 20:41:20 +090012477/*[clinic input]
12478str.lstrip as unicode_lstrip
12479
12480 chars: object = NULL
12481 /
12482
12483Return a copy of the string with leading whitespace removed.
12484
12485If chars is given and not None, remove characters in chars instead.
12486[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012487
12488static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012489unicode_lstrip_impl(PyObject *self, PyObject *chars)
12490/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012491{
INADA Naoki3ae20562017-01-16 20:41:20 +090012492 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012493}
12494
12495
INADA Naoki3ae20562017-01-16 20:41:20 +090012496/*[clinic input]
12497str.rstrip as unicode_rstrip
12498
12499 chars: object = NULL
12500 /
12501
12502Return a copy of the string with trailing whitespace removed.
12503
12504If chars is given and not None, remove characters in chars instead.
12505[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012506
12507static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012508unicode_rstrip_impl(PyObject *self, PyObject *chars)
12509/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012510{
INADA Naoki3ae20562017-01-16 20:41:20 +090012511 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012512}
12513
12514
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012516unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012518 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520
Serhiy Storchaka05997252013-01-26 12:14:02 +020012521 if (len < 1)
12522 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523
Victor Stinnerc4b49542011-12-11 22:44:26 +010012524 /* no repeat, return original string */
12525 if (len == 1)
12526 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012527
Benjamin Petersonbac79492012-01-14 13:34:47 -050012528 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 return NULL;
12530
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012531 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012532 PyErr_SetString(PyExc_OverflowError,
12533 "repeated string is too long");
12534 return NULL;
12535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012537
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012538 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539 if (!u)
12540 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012541 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 if (PyUnicode_GET_LENGTH(str) == 1) {
12544 const int kind = PyUnicode_KIND(str);
12545 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012546 if (kind == PyUnicode_1BYTE_KIND) {
12547 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012548 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012549 }
12550 else if (kind == PyUnicode_2BYTE_KIND) {
12551 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012552 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012553 ucs2[n] = fill_char;
12554 } else {
12555 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12556 assert(kind == PyUnicode_4BYTE_KIND);
12557 for (n = 0; n < len; ++n)
12558 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 }
12561 else {
12562 /* number of characters copied this far */
12563 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012564 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012566 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012568 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012570 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012571 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573 }
12574
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012575 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012576 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577}
12578
Alexander Belopolsky40018472011-02-26 01:02:56 +000012579PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012580PyUnicode_Replace(PyObject *str,
12581 PyObject *substr,
12582 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012583 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012585 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12586 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012587 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012588 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589}
12590
INADA Naoki3ae20562017-01-16 20:41:20 +090012591/*[clinic input]
12592str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593
INADA Naoki3ae20562017-01-16 20:41:20 +090012594 old: unicode
12595 new: unicode
12596 count: Py_ssize_t = -1
12597 Maximum number of occurrences to replace.
12598 -1 (the default value) means replace all occurrences.
12599 /
12600
12601Return a copy with all occurrences of substring old replaced by new.
12602
12603If the optional argument count is given, only the first count occurrences are
12604replaced.
12605[clinic start generated code]*/
12606
12607static PyObject *
12608unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12609 Py_ssize_t count)
12610/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012612 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012613 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012614 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615}
12616
Alexander Belopolsky40018472011-02-26 01:02:56 +000012617static PyObject *
12618unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012620 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 Py_ssize_t isize;
12622 Py_ssize_t osize, squote, dquote, i, o;
12623 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012624 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012628 return NULL;
12629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 isize = PyUnicode_GET_LENGTH(unicode);
12631 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 /* Compute length of output, quote characters, and
12634 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012635 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 max = 127;
12637 squote = dquote = 0;
12638 ikind = PyUnicode_KIND(unicode);
12639 for (i = 0; i < isize; i++) {
12640 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012641 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012643 case '\'': squote++; break;
12644 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012646 incr = 2;
12647 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 default:
12649 /* Fast-path ASCII */
12650 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012651 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012653 ;
12654 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012657 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012659 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012661 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012663 if (osize > PY_SSIZE_T_MAX - incr) {
12664 PyErr_SetString(PyExc_OverflowError,
12665 "string is too long to generate repr");
12666 return NULL;
12667 }
12668 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 }
12670
12671 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012672 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012674 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 if (dquote)
12676 /* Both squote and dquote present. Use squote,
12677 and escape them */
12678 osize += squote;
12679 else
12680 quote = '"';
12681 }
Victor Stinner55c08782013-04-14 18:45:39 +020012682 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683
12684 repr = PyUnicode_New(osize, max);
12685 if (repr == NULL)
12686 return NULL;
12687 okind = PyUnicode_KIND(repr);
12688 odata = PyUnicode_DATA(repr);
12689
12690 PyUnicode_WRITE(okind, odata, 0, quote);
12691 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012692 if (unchanged) {
12693 _PyUnicode_FastCopyCharacters(repr, 1,
12694 unicode, 0,
12695 isize);
12696 }
12697 else {
12698 for (i = 0, o = 1; i < isize; i++) {
12699 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700
Victor Stinner55c08782013-04-14 18:45:39 +020012701 /* Escape quotes and backslashes */
12702 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012703 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012705 continue;
12706 }
12707
12708 /* Map special whitespace to '\t', \n', '\r' */
12709 if (ch == '\t') {
12710 PyUnicode_WRITE(okind, odata, o++, '\\');
12711 PyUnicode_WRITE(okind, odata, o++, 't');
12712 }
12713 else if (ch == '\n') {
12714 PyUnicode_WRITE(okind, odata, o++, '\\');
12715 PyUnicode_WRITE(okind, odata, o++, 'n');
12716 }
12717 else if (ch == '\r') {
12718 PyUnicode_WRITE(okind, odata, o++, '\\');
12719 PyUnicode_WRITE(okind, odata, o++, 'r');
12720 }
12721
12722 /* Map non-printable US ASCII to '\xhh' */
12723 else if (ch < ' ' || ch == 0x7F) {
12724 PyUnicode_WRITE(okind, odata, o++, '\\');
12725 PyUnicode_WRITE(okind, odata, o++, 'x');
12726 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12727 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12728 }
12729
12730 /* Copy ASCII characters as-is */
12731 else if (ch < 0x7F) {
12732 PyUnicode_WRITE(okind, odata, o++, ch);
12733 }
12734
12735 /* Non-ASCII characters */
12736 else {
12737 /* Map Unicode whitespace and control characters
12738 (categories Z* and C* except ASCII space)
12739 */
12740 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12741 PyUnicode_WRITE(okind, odata, o++, '\\');
12742 /* Map 8-bit characters to '\xhh' */
12743 if (ch <= 0xff) {
12744 PyUnicode_WRITE(okind, odata, o++, 'x');
12745 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12746 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12747 }
12748 /* Map 16-bit characters to '\uxxxx' */
12749 else if (ch <= 0xffff) {
12750 PyUnicode_WRITE(okind, odata, o++, 'u');
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12755 }
12756 /* Map 21-bit characters to '\U00xxxxxx' */
12757 else {
12758 PyUnicode_WRITE(okind, odata, o++, 'U');
12759 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12762 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12763 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12764 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12765 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12767 }
12768 }
12769 /* Copy characters as-is */
12770 else {
12771 PyUnicode_WRITE(okind, odata, o++, ch);
12772 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012773 }
12774 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012777 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012778 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779}
12780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012781PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012782 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783\n\
12784Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012785such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786arguments start and end are interpreted as in slice notation.\n\
12787\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012788Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789
12790static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012793 /* initialize variables to prevent gcc warning */
12794 PyObject *substring = NULL;
12795 Py_ssize_t start = 0;
12796 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012799 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012802 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012805 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 if (result == -2)
12808 return NULL;
12809
Christian Heimes217cfd12007-12-02 14:31:20 +000012810 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811}
12812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012813PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012816Return the highest index in S where substring sub is found,\n\
12817such that sub is contained within S[start:end]. Optional\n\
12818arguments start and end are interpreted as in slice notation.\n\
12819\n\
12820Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821
12822static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012825 /* initialize variables to prevent gcc warning */
12826 PyObject *substring = NULL;
12827 Py_ssize_t start = 0;
12828 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012829 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012831 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012832 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012834 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012836
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012837 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 if (result == -2)
12840 return NULL;
12841
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842 if (result < 0) {
12843 PyErr_SetString(PyExc_ValueError, "substring not found");
12844 return NULL;
12845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846
Christian Heimes217cfd12007-12-02 14:31:20 +000012847 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848}
12849
INADA Naoki3ae20562017-01-16 20:41:20 +090012850/*[clinic input]
12851str.rjust as unicode_rjust
12852
12853 width: Py_ssize_t
12854 fillchar: Py_UCS4 = ' '
12855 /
12856
12857Return a right-justified string of length width.
12858
12859Padding is done using the specified fill character (default is a space).
12860[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861
12862static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012863unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12864/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012866 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867 return NULL;
12868
Victor Stinnerc4b49542011-12-11 22:44:26 +010012869 if (PyUnicode_GET_LENGTH(self) >= width)
12870 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871
Victor Stinnerc4b49542011-12-11 22:44:26 +010012872 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873}
12874
Alexander Belopolsky40018472011-02-26 01:02:56 +000012875PyObject *
12876PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012878 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012881 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882}
12883
INADA Naoki3ae20562017-01-16 20:41:20 +090012884/*[clinic input]
12885str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886
INADA Naoki3ae20562017-01-16 20:41:20 +090012887 sep: object = None
        The delimiter according to which to split the string.
12889 None (the default value) means split according to any whitespace,
12890 and discard empty strings from the result.
12891 maxsplit: Py_ssize_t = -1
12892 Maximum number of splits to do.
12893 -1 (the default value) means no limit.
12894
12895Return a list of the words in the string, using sep as the delimiter string.
12896[clinic start generated code]*/
12897
12898static PyObject *
12899unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12900/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901{
INADA Naoki3ae20562017-01-16 20:41:20 +090012902 if (sep == Py_None)
12903 return split(self, NULL, maxsplit);
12904 if (PyUnicode_Check(sep))
12905 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012906
12907 PyErr_Format(PyExc_TypeError,
12908 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012909 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911}
12912
Thomas Wouters477c8d52006-05-27 19:21:47 +000012913PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012914PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012915{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012916 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012917 int kind1, kind2;
12918 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012921 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012922 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923
Victor Stinner14f8f022011-10-05 20:58:25 +020012924 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 len1 = PyUnicode_GET_LENGTH(str_obj);
12927 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012928 if (kind1 < kind2 || len1 < len2) {
12929 _Py_INCREF_UNICODE_EMPTY();
12930 if (!unicode_empty)
12931 out = NULL;
12932 else {
12933 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12934 Py_DECREF(unicode_empty);
12935 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012936 return out;
12937 }
12938 buf1 = PyUnicode_DATA(str_obj);
12939 buf2 = PyUnicode_DATA(sep_obj);
12940 if (kind2 != kind1) {
12941 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12942 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012943 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012946 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012948 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12949 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12950 else
12951 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952 break;
12953 case PyUnicode_2BYTE_KIND:
12954 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12955 break;
12956 case PyUnicode_4BYTE_KIND:
12957 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12958 break;
12959 default:
12960 assert(0);
12961 out = 0;
12962 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012963
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012964 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012966
12967 return out;
12968}
12969
12970
12971PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012972PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012973{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012974 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012975 int kind1, kind2;
12976 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012978
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012979 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012980 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012982 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 len1 = PyUnicode_GET_LENGTH(str_obj);
12985 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012986 if (kind1 < kind2 || len1 < len2) {
12987 _Py_INCREF_UNICODE_EMPTY();
12988 if (!unicode_empty)
12989 out = NULL;
12990 else {
12991 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12992 Py_DECREF(unicode_empty);
12993 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012994 return out;
12995 }
12996 buf1 = PyUnicode_DATA(str_obj);
12997 buf2 = PyUnicode_DATA(sep_obj);
12998 if (kind2 != kind1) {
12999 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13000 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013001 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013004 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013006 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13007 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13008 else
13009 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013010 break;
13011 case PyUnicode_2BYTE_KIND:
13012 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13013 break;
13014 case PyUnicode_4BYTE_KIND:
13015 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13016 break;
13017 default:
13018 assert(0);
13019 out = 0;
13020 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013021
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013022 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013024
13025 return out;
13026}
13027
INADA Naoki3ae20562017-01-16 20:41:20 +090013028/*[clinic input]
13029str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013030
INADA Naoki3ae20562017-01-16 20:41:20 +090013031 sep: object
13032 /
13033
13034Partition the string into three parts using the given separator.
13035
13036This will search for the separator in the string. If the separator is found,
13037returns a 3-tuple containing the part before the separator, the separator
13038itself, and the part after it.
13039
13040If the separator is not found, returns a 3-tuple containing the original string
13041and two empty strings.
13042[clinic start generated code]*/
13043
13044static PyObject *
13045unicode_partition(PyObject *self, PyObject *sep)
13046/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013047{
INADA Naoki3ae20562017-01-16 20:41:20 +090013048 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013049}
13050
INADA Naoki3ae20562017-01-16 20:41:20 +090013051/*[clinic input]
13052str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053
INADA Naoki3ae20562017-01-16 20:41:20 +090013054Partition the string into three parts using the given separator.
13055
This will search for the separator in the string, starting at the end. If
13057the separator is found, returns a 3-tuple containing the part before the
13058separator, the separator itself, and the part after it.
13059
13060If the separator is not found, returns a 3-tuple containing two empty strings
13061and the original string.
13062[clinic start generated code]*/
13063
13064static PyObject *
13065unicode_rpartition(PyObject *self, PyObject *sep)
13066/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013067{
INADA Naoki3ae20562017-01-16 20:41:20 +090013068 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013069}
13070
Alexander Belopolsky40018472011-02-26 01:02:56 +000013071PyObject *
13072PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013073{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013074 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013075 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013076
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013077 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013078}
13079
INADA Naoki3ae20562017-01-16 20:41:20 +090013080/*[clinic input]
13081str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013082
INADA Naoki3ae20562017-01-16 20:41:20 +090013083Return a list of the words in the string, using sep as the delimiter string.
13084
13085Splits are done starting at the end of the string and working to the front.
13086[clinic start generated code]*/
13087
13088static PyObject *
13089unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13090/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013091{
INADA Naoki3ae20562017-01-16 20:41:20 +090013092 if (sep == Py_None)
13093 return rsplit(self, NULL, maxsplit);
13094 if (PyUnicode_Check(sep))
13095 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013096
13097 PyErr_Format(PyExc_TypeError,
13098 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013099 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013100 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013101}
13102
INADA Naoki3ae20562017-01-16 20:41:20 +090013103/*[clinic input]
13104str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013106 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013107
13108Return a list of the lines in the string, breaking at line boundaries.
13109
13110Line breaks are not included in the resulting list unless keepends is given and
13111true.
13112[clinic start generated code]*/
13113
13114static PyObject *
13115unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013116/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013118 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119}
13120
13121static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013122PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013124 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125}
13126
INADA Naoki3ae20562017-01-16 20:41:20 +090013127/*[clinic input]
13128str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
INADA Naoki3ae20562017-01-16 20:41:20 +090013130Convert uppercase characters to lowercase and lowercase characters to uppercase.
13131[clinic start generated code]*/
13132
13133static PyObject *
13134unicode_swapcase_impl(PyObject *self)
13135/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013137 if (PyUnicode_READY(self) == -1)
13138 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013139 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140}
13141
Larry Hastings61272b72014-01-07 12:41:53 -080013142/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013143
Larry Hastings31826802013-10-19 00:09:25 -070013144@staticmethod
13145str.maketrans as unicode_maketrans
13146
13147 x: object
13148
13149 y: unicode=NULL
13150
13151 z: unicode=NULL
13152
13153 /
13154
13155Return a translation table usable for str.translate().
13156
13157If there is only one argument, it must be a dictionary mapping Unicode
13158ordinals (integers) or characters to Unicode ordinals, strings or None.
13159Character keys will be then converted to ordinals.
13160If there are two arguments, they must be strings of equal length, and
13161in the resulting dictionary, each character in x will be mapped to the
13162character at the same position in y. If there is a third argument, it
13163must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013164[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013165
Larry Hastings31826802013-10-19 00:09:25 -070013166static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013167unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013168/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013169{
Georg Brandlceee0772007-11-27 23:48:05 +000013170 PyObject *new = NULL, *key, *value;
13171 Py_ssize_t i = 0;
13172 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173
Georg Brandlceee0772007-11-27 23:48:05 +000013174 new = PyDict_New();
13175 if (!new)
13176 return NULL;
13177 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178 int x_kind, y_kind, z_kind;
13179 void *x_data, *y_data, *z_data;
13180
Georg Brandlceee0772007-11-27 23:48:05 +000013181 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013182 if (!PyUnicode_Check(x)) {
13183 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13184 "be a string if there is a second argument");
13185 goto err;
13186 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013187 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013188 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13189 "arguments must have equal length");
13190 goto err;
13191 }
13192 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013193 x_kind = PyUnicode_KIND(x);
13194 y_kind = PyUnicode_KIND(y);
13195 x_data = PyUnicode_DATA(x);
13196 y_data = PyUnicode_DATA(y);
13197 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13198 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013199 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013200 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013201 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013202 if (!value) {
13203 Py_DECREF(key);
13204 goto err;
13205 }
Georg Brandlceee0772007-11-27 23:48:05 +000013206 res = PyDict_SetItem(new, key, value);
13207 Py_DECREF(key);
13208 Py_DECREF(value);
13209 if (res < 0)
13210 goto err;
13211 }
13212 /* create entries for deleting chars in z */
13213 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013214 z_kind = PyUnicode_KIND(z);
13215 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013216 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013217 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013218 if (!key)
13219 goto err;
13220 res = PyDict_SetItem(new, key, Py_None);
13221 Py_DECREF(key);
13222 if (res < 0)
13223 goto err;
13224 }
13225 }
13226 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 int kind;
13228 void *data;
13229
Georg Brandlceee0772007-11-27 23:48:05 +000013230 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013231 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013232 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13233 "to maketrans it must be a dict");
13234 goto err;
13235 }
13236 /* copy entries into the new dict, converting string keys to int keys */
13237 while (PyDict_Next(x, &i, &key, &value)) {
13238 if (PyUnicode_Check(key)) {
13239 /* convert string keys to integer keys */
13240 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013241 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013242 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13243 "table must be of length 1");
13244 goto err;
13245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013246 kind = PyUnicode_KIND(key);
13247 data = PyUnicode_DATA(key);
13248 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013249 if (!newkey)
13250 goto err;
13251 res = PyDict_SetItem(new, newkey, value);
13252 Py_DECREF(newkey);
13253 if (res < 0)
13254 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013255 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013256 /* just keep integer keys */
13257 if (PyDict_SetItem(new, key, value) < 0)
13258 goto err;
13259 } else {
13260 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13261 "be strings or integers");
13262 goto err;
13263 }
13264 }
13265 }
13266 return new;
13267 err:
13268 Py_DECREF(new);
13269 return NULL;
13270}
13271
INADA Naoki3ae20562017-01-16 20:41:20 +090013272/*[clinic input]
13273str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274
INADA Naoki3ae20562017-01-16 20:41:20 +090013275 table: object
13276 Translation table, which must be a mapping of Unicode ordinals to
13277 Unicode ordinals, strings, or None.
13278 /
13279
13280Replace each character in the string using the given translation table.
13281
13282The table must implement lookup/indexing via __getitem__, for instance a
13283dictionary or list. If this operation raises LookupError, the character is
13284left untouched. Characters mapped to None are deleted.
13285[clinic start generated code]*/
13286
13287static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013289/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013291 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292}
13293
INADA Naoki3ae20562017-01-16 20:41:20 +090013294/*[clinic input]
13295str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296
INADA Naoki3ae20562017-01-16 20:41:20 +090013297Return a copy of the string converted to uppercase.
13298[clinic start generated code]*/
13299
13300static PyObject *
13301unicode_upper_impl(PyObject *self)
13302/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013304 if (PyUnicode_READY(self) == -1)
13305 return NULL;
13306 if (PyUnicode_IS_ASCII(self))
13307 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013308 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309}
13310
INADA Naoki3ae20562017-01-16 20:41:20 +090013311/*[clinic input]
13312str.zfill as unicode_zfill
13313
13314 width: Py_ssize_t
13315 /
13316
13317Pad a numeric string with zeros on the left, to fill a field of the given width.
13318
13319The string is never truncated.
13320[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013321
13322static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013323unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013324/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013325{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013326 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013327 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 int kind;
13329 void *data;
13330 Py_UCS4 chr;
13331
Benjamin Petersonbac79492012-01-14 13:34:47 -050013332 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013334
Victor Stinnerc4b49542011-12-11 22:44:26 +010013335 if (PyUnicode_GET_LENGTH(self) >= width)
13336 return unicode_result_unchanged(self);
13337
13338 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339
13340 u = pad(self, fill, 0, '0');
13341
Walter Dörwald068325e2002-04-15 13:36:47 +000013342 if (u == NULL)
13343 return NULL;
13344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013345 kind = PyUnicode_KIND(u);
13346 data = PyUnicode_DATA(u);
13347 chr = PyUnicode_READ(kind, data, fill);
13348
13349 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013351 PyUnicode_WRITE(kind, data, 0, chr);
13352 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353 }
13354
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013355 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013356 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358
#if 0
/* Compiled out: wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013367PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013368 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013369\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013370Return True if S starts with the specified prefix, False otherwise.\n\
13371With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013372With optional end, stop comparing S at that position.\n\
13373prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374
13375static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013376unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013379 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013380 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013381 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013382 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013383 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384
Jesus Ceaac451502011-04-20 17:09:23 +020013385 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013386 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013387 if (PyTuple_Check(subobj)) {
13388 Py_ssize_t i;
13389 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013390 substring = PyTuple_GET_ITEM(subobj, i);
13391 if (!PyUnicode_Check(substring)) {
13392 PyErr_Format(PyExc_TypeError,
13393 "tuple for startswith must only contain str, "
13394 "not %.100s",
13395 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013396 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013397 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013398 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013399 if (result == -1)
13400 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 if (result) {
13402 Py_RETURN_TRUE;
13403 }
13404 }
13405 /* nothing matched */
13406 Py_RETURN_FALSE;
13407 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013408 if (!PyUnicode_Check(subobj)) {
13409 PyErr_Format(PyExc_TypeError,
13410 "startswith first arg must be str or "
13411 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013413 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013414 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013415 if (result == -1)
13416 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013417 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013418}
13419
13420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013421PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013424Return True if S ends with the specified suffix, False otherwise.\n\
13425With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013426With optional end, stop comparing S at that position.\n\
13427suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428
13429static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013430unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013431 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013433 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013434 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013435 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013436 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013437 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438
Jesus Ceaac451502011-04-20 17:09:23 +020013439 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013441 if (PyTuple_Check(subobj)) {
13442 Py_ssize_t i;
13443 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013444 substring = PyTuple_GET_ITEM(subobj, i);
13445 if (!PyUnicode_Check(substring)) {
13446 PyErr_Format(PyExc_TypeError,
13447 "tuple for endswith must only contain str, "
13448 "not %.100s",
13449 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013451 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013452 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013453 if (result == -1)
13454 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013455 if (result) {
13456 Py_RETURN_TRUE;
13457 }
13458 }
13459 Py_RETURN_FALSE;
13460 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013461 if (!PyUnicode_Check(subobj)) {
13462 PyErr_Format(PyExc_TypeError,
13463 "endswith first arg must be str or "
13464 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013465 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013466 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013467 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013468 if (result == -1)
13469 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013470 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013471}
13472
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013473static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013474_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013475{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013476 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13477 writer->data = PyUnicode_DATA(writer->buffer);
13478
13479 if (!writer->readonly) {
13480 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013481 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013482 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013483 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013484 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13485 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13486 writer->kind = PyUnicode_WCHAR_KIND;
13487 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13488
Victor Stinner8f674cc2013-04-17 23:02:17 +020013489 /* Copy-on-write mode: set buffer size to 0 so
13490 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13491 * next write. */
13492 writer->size = 0;
13493 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013494}
13495
Victor Stinnerd3f08822012-05-29 12:57:52 +020013496void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013497_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013498{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013499 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013500
13501 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013502 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013503
13504 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13505 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13506 writer->kind = PyUnicode_WCHAR_KIND;
13507 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013508}
13509
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510int
13511_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13512 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013513{
13514 Py_ssize_t newlen;
13515 PyObject *newbuffer;
13516
Victor Stinner2740e462016-09-06 16:58:36 -070013517 assert(maxchar <= MAX_UNICODE);
13518
Victor Stinnerca9381e2015-09-22 00:58:32 +020013519 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013520 assert((maxchar > writer->maxchar && length >= 0)
13521 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013522
Victor Stinner202fdca2012-05-07 12:47:02 +020013523 if (length > PY_SSIZE_T_MAX - writer->pos) {
13524 PyErr_NoMemory();
13525 return -1;
13526 }
13527 newlen = writer->pos + length;
13528
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013529 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013530
Victor Stinnerd3f08822012-05-29 12:57:52 +020013531 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013532 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013533 if (writer->overallocate
13534 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13535 /* overallocate to limit the number of realloc() */
13536 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013537 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013538 if (newlen < writer->min_length)
13539 newlen = writer->min_length;
13540
Victor Stinnerd3f08822012-05-29 12:57:52 +020013541 writer->buffer = PyUnicode_New(newlen, maxchar);
13542 if (writer->buffer == NULL)
13543 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013544 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013545 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013546 if (writer->overallocate
13547 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13548 /* overallocate to limit the number of realloc() */
13549 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013550 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013551 if (newlen < writer->min_length)
13552 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013553
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013554 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013555 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013556 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013557 newbuffer = PyUnicode_New(newlen, maxchar);
13558 if (newbuffer == NULL)
13559 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13561 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013562 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013563 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013564 }
13565 else {
13566 newbuffer = resize_compact(writer->buffer, newlen);
13567 if (newbuffer == NULL)
13568 return -1;
13569 }
13570 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013571 }
13572 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013573 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013574 newbuffer = PyUnicode_New(writer->size, maxchar);
13575 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013576 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13578 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013579 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013580 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013581 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013582 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013583
13584#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013585}
13586
Victor Stinnerca9381e2015-09-22 00:58:32 +020013587int
13588_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13589 enum PyUnicode_Kind kind)
13590{
13591 Py_UCS4 maxchar;
13592
13593 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13594 assert(writer->kind < kind);
13595
13596 switch (kind)
13597 {
13598 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13599 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13600 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13601 default:
13602 assert(0 && "invalid kind");
13603 return -1;
13604 }
13605
13606 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13607}
13608
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013609static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013610_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013611{
Victor Stinner2740e462016-09-06 16:58:36 -070013612 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013613 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13614 return -1;
13615 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13616 writer->pos++;
13617 return 0;
13618}
13619
13620int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013621_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13622{
13623 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13624}
13625
13626int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013627_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13628{
13629 Py_UCS4 maxchar;
13630 Py_ssize_t len;
13631
13632 if (PyUnicode_READY(str) == -1)
13633 return -1;
13634 len = PyUnicode_GET_LENGTH(str);
13635 if (len == 0)
13636 return 0;
13637 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13638 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013639 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013640 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013641 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013642 Py_INCREF(str);
13643 writer->buffer = str;
13644 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013645 writer->pos += len;
13646 return 0;
13647 }
13648 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13649 return -1;
13650 }
13651 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13652 str, 0, len);
13653 writer->pos += len;
13654 return 0;
13655}
13656
Victor Stinnere215d962012-10-06 23:03:36 +020013657int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013658_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13659 Py_ssize_t start, Py_ssize_t end)
13660{
13661 Py_UCS4 maxchar;
13662 Py_ssize_t len;
13663
13664 if (PyUnicode_READY(str) == -1)
13665 return -1;
13666
13667 assert(0 <= start);
13668 assert(end <= PyUnicode_GET_LENGTH(str));
13669 assert(start <= end);
13670
13671 if (end == 0)
13672 return 0;
13673
13674 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13675 return _PyUnicodeWriter_WriteStr(writer, str);
13676
13677 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13678 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13679 else
13680 maxchar = writer->maxchar;
13681 len = end - start;
13682
13683 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13684 return -1;
13685
13686 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13687 str, start, len);
13688 writer->pos += len;
13689 return 0;
13690}
13691
13692int
Victor Stinner4a587072013-11-19 12:54:53 +010013693_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13694 const char *ascii, Py_ssize_t len)
13695{
13696 if (len == -1)
13697 len = strlen(ascii);
13698
13699 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13700
13701 if (writer->buffer == NULL && !writer->overallocate) {
13702 PyObject *str;
13703
13704 str = _PyUnicode_FromASCII(ascii, len);
13705 if (str == NULL)
13706 return -1;
13707
13708 writer->readonly = 1;
13709 writer->buffer = str;
13710 _PyUnicodeWriter_Update(writer);
13711 writer->pos += len;
13712 return 0;
13713 }
13714
13715 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13716 return -1;
13717
13718 switch (writer->kind)
13719 {
13720 case PyUnicode_1BYTE_KIND:
13721 {
13722 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13723 Py_UCS1 *data = writer->data;
13724
Christian Heimesf051e432016-09-13 20:22:02 +020013725 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013726 break;
13727 }
13728 case PyUnicode_2BYTE_KIND:
13729 {
13730 _PyUnicode_CONVERT_BYTES(
13731 Py_UCS1, Py_UCS2,
13732 ascii, ascii + len,
13733 (Py_UCS2 *)writer->data + writer->pos);
13734 break;
13735 }
13736 case PyUnicode_4BYTE_KIND:
13737 {
13738 _PyUnicode_CONVERT_BYTES(
13739 Py_UCS1, Py_UCS4,
13740 ascii, ascii + len,
13741 (Py_UCS4 *)writer->data + writer->pos);
13742 break;
13743 }
13744 default:
13745 assert(0);
13746 }
13747
13748 writer->pos += len;
13749 return 0;
13750}
13751
13752int
13753_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13754 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013755{
13756 Py_UCS4 maxchar;
13757
13758 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13759 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13760 return -1;
13761 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13762 writer->pos += len;
13763 return 0;
13764}
13765
Victor Stinnerd3f08822012-05-29 12:57:52 +020013766PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013767_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013768{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013769 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013770
Victor Stinnerd3f08822012-05-29 12:57:52 +020013771 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013772 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013773 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013774 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013775
13776 str = writer->buffer;
13777 writer->buffer = NULL;
13778
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013779 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013780 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13781 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013782 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013783
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013784 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13785 PyObject *str2;
13786 str2 = resize_compact(str, writer->pos);
13787 if (str2 == NULL) {
13788 Py_DECREF(str);
13789 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013790 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013791 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013792 }
13793
Victor Stinner15a0bd32013-07-08 22:29:55 +020013794 assert(_PyUnicode_CheckConsistency(str, 1));
13795 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013796}
13797
Victor Stinnerd3f08822012-05-29 12:57:52 +020013798void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013799_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013800{
13801 Py_CLEAR(writer->buffer);
13802}
13803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013804#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013805
13806PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013807 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013808\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013809Return a formatted version of S, using substitutions from args and kwargs.\n\
13810The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013811
Eric Smith27bbca62010-11-04 17:06:58 +000013812PyDoc_STRVAR(format_map__doc__,
13813 "S.format_map(mapping) -> str\n\
13814\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013815Return a formatted version of S, using substitutions from mapping.\n\
13816The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013817
INADA Naoki3ae20562017-01-16 20:41:20 +090013818/*[clinic input]
13819str.__format__ as unicode___format__
13820
13821 format_spec: unicode
13822 /
13823
13824Return a formatted version of the string as described by format_spec.
13825[clinic start generated code]*/
13826
Eric Smith4a7d76d2008-05-30 18:10:19 +000013827static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013828unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013829/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013830{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013831 _PyUnicodeWriter writer;
13832 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013833
Victor Stinnerd3f08822012-05-29 12:57:52 +020013834 if (PyUnicode_READY(self) == -1)
13835 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013836 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013837 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13838 self, format_spec, 0,
13839 PyUnicode_GET_LENGTH(format_spec));
13840 if (ret == -1) {
13841 _PyUnicodeWriter_Dealloc(&writer);
13842 return NULL;
13843 }
13844 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013845}
13846
INADA Naoki3ae20562017-01-16 20:41:20 +090013847/*[clinic input]
13848str.__sizeof__ as unicode_sizeof
13849
13850Return the size of the string in memory, in bytes.
13851[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013852
13853static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013854unicode_sizeof_impl(PyObject *self)
13855/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013856{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013857 Py_ssize_t size;
13858
13859 /* If it's a compact object, account for base structure +
13860 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013861 if (PyUnicode_IS_COMPACT_ASCII(self))
13862 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13863 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013864 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013865 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013866 else {
13867 /* If it is a two-block object, account for base object, and
13868 for character block if present. */
13869 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013870 if (_PyUnicode_DATA_ANY(self))
13871 size += (PyUnicode_GET_LENGTH(self) + 1) *
13872 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013873 }
13874 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013875 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013876 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13877 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13878 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13879 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013880
13881 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013882}
13883
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013884static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013885unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013886{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013887 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013888 if (!copy)
13889 return NULL;
13890 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013891}
13892
Guido van Rossumd57fd912000-03-10 22:53:23 +000013893static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013894 UNICODE_ENCODE_METHODDEF
13895 UNICODE_REPLACE_METHODDEF
13896 UNICODE_SPLIT_METHODDEF
13897 UNICODE_RSPLIT_METHODDEF
13898 UNICODE_JOIN_METHODDEF
13899 UNICODE_CAPITALIZE_METHODDEF
13900 UNICODE_CASEFOLD_METHODDEF
13901 UNICODE_TITLE_METHODDEF
13902 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013903 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013904 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013905 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013906 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013907 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013908 UNICODE_LJUST_METHODDEF
13909 UNICODE_LOWER_METHODDEF
13910 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013911 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13912 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013913 UNICODE_RJUST_METHODDEF
13914 UNICODE_RSTRIP_METHODDEF
13915 UNICODE_RPARTITION_METHODDEF
13916 UNICODE_SPLITLINES_METHODDEF
13917 UNICODE_STRIP_METHODDEF
13918 UNICODE_SWAPCASE_METHODDEF
13919 UNICODE_TRANSLATE_METHODDEF
13920 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013921 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13922 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013923 UNICODE_ISLOWER_METHODDEF
13924 UNICODE_ISUPPER_METHODDEF
13925 UNICODE_ISTITLE_METHODDEF
13926 UNICODE_ISSPACE_METHODDEF
13927 UNICODE_ISDECIMAL_METHODDEF
13928 UNICODE_ISDIGIT_METHODDEF
13929 UNICODE_ISNUMERIC_METHODDEF
13930 UNICODE_ISALPHA_METHODDEF
13931 UNICODE_ISALNUM_METHODDEF
13932 UNICODE_ISIDENTIFIER_METHODDEF
13933 UNICODE_ISPRINTABLE_METHODDEF
13934 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013935 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013936 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013937 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013938 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013939 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013940#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013941 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013942 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013943#endif
13944
Benjamin Peterson14339b62009-01-31 16:36:08 +000013945 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013946 {NULL, NULL}
13947};
13948
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013949static PyObject *
13950unicode_mod(PyObject *v, PyObject *w)
13951{
Brian Curtindfc80e32011-08-10 20:28:54 -050013952 if (!PyUnicode_Check(v))
13953 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013954 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013955}
13956
13957static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 0, /*nb_add*/
13959 0, /*nb_subtract*/
13960 0, /*nb_multiply*/
13961 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013962};
13963
Guido van Rossumd57fd912000-03-10 22:53:23 +000013964static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013965 (lenfunc) unicode_length, /* sq_length */
13966 PyUnicode_Concat, /* sq_concat */
13967 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13968 (ssizeargfunc) unicode_getitem, /* sq_item */
13969 0, /* sq_slice */
13970 0, /* sq_ass_item */
13971 0, /* sq_ass_slice */
13972 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013973};
13974
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013975static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013976unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013977{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013978 if (PyUnicode_READY(self) == -1)
13979 return NULL;
13980
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013981 if (PyIndex_Check(item)) {
13982 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013983 if (i == -1 && PyErr_Occurred())
13984 return NULL;
13985 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013986 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013987 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013988 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013989 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013990 PyObject *result;
13991 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013992 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013993 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013994
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013995 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013996 return NULL;
13997 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013998 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13999 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014000
14001 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014002 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014003 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014004 slicelength == PyUnicode_GET_LENGTH(self)) {
14005 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014006 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014007 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014008 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014009 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014010 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014011 src_kind = PyUnicode_KIND(self);
14012 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014013 if (!PyUnicode_IS_ASCII(self)) {
14014 kind_limit = kind_maxchar_limit(src_kind);
14015 max_char = 0;
14016 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14017 ch = PyUnicode_READ(src_kind, src_data, cur);
14018 if (ch > max_char) {
14019 max_char = ch;
14020 if (max_char >= kind_limit)
14021 break;
14022 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014023 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014024 }
Victor Stinner55c99112011-10-13 01:17:06 +020014025 else
14026 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014027 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014028 if (result == NULL)
14029 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014030 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014031 dest_data = PyUnicode_DATA(result);
14032
14033 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014034 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14035 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014036 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014037 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014038 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014039 } else {
14040 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14041 return NULL;
14042 }
14043}
14044
14045static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014046 (lenfunc)unicode_length, /* mp_length */
14047 (binaryfunc)unicode_subscript, /* mp_subscript */
14048 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014049};
14050
Guido van Rossumd57fd912000-03-10 22:53:23 +000014051
Guido van Rossumd57fd912000-03-10 22:53:23 +000014052/* Helpers for PyUnicode_Format() */
14053
Victor Stinnera47082312012-10-04 02:19:54 +020014054struct unicode_formatter_t {
14055 PyObject *args;
14056 int args_owned;
14057 Py_ssize_t arglen, argidx;
14058 PyObject *dict;
14059
14060 enum PyUnicode_Kind fmtkind;
14061 Py_ssize_t fmtcnt, fmtpos;
14062 void *fmtdata;
14063 PyObject *fmtstr;
14064
14065 _PyUnicodeWriter writer;
14066};
14067
14068struct unicode_format_arg_t {
14069 Py_UCS4 ch;
14070 int flags;
14071 Py_ssize_t width;
14072 int prec;
14073 int sign;
14074};
14075
Guido van Rossumd57fd912000-03-10 22:53:23 +000014076static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014077unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014078{
Victor Stinnera47082312012-10-04 02:19:54 +020014079 Py_ssize_t argidx = ctx->argidx;
14080
14081 if (argidx < ctx->arglen) {
14082 ctx->argidx++;
14083 if (ctx->arglen < 0)
14084 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014085 else
Victor Stinnera47082312012-10-04 02:19:54 +020014086 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014087 }
14088 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014089 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014090 return NULL;
14091}
14092
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014093/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014094
Victor Stinnera47082312012-10-04 02:19:54 +020014095/* Format a float into the writer if the writer is not NULL, or into *p_output
14096 otherwise.
14097
14098 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014099static int
Victor Stinnera47082312012-10-04 02:19:54 +020014100formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14101 PyObject **p_output,
14102 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014103{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014104 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014105 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014106 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014107 int prec;
14108 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014109
Guido van Rossumd57fd912000-03-10 22:53:23 +000014110 x = PyFloat_AsDouble(v);
14111 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014112 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014113
Victor Stinnera47082312012-10-04 02:19:54 +020014114 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014115 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014116 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014117
Victor Stinnera47082312012-10-04 02:19:54 +020014118 if (arg->flags & F_ALT)
14119 dtoa_flags = Py_DTSF_ALT;
14120 else
14121 dtoa_flags = 0;
14122 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014123 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014124 return -1;
14125 len = strlen(p);
14126 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014127 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014128 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014129 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014130 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014131 }
14132 else
14133 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014134 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014135 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014136}
14137
Victor Stinnerd0880d52012-04-27 23:40:13 +020014138/* formatlong() emulates the format codes d, u, o, x and X, and
14139 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14140 * Python's regular ints.
14141 * Return value: a new PyUnicodeObject*, or NULL if error.
14142 * The output string is of the form
14143 * "-"? ("0x" | "0X")? digit+
14144 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14145 * set in flags. The case of hex digits will be correct,
14146 * There will be at least prec digits, zero-filled on the left if
14147 * necessary to get that many.
14148 * val object to be converted
14149 * flags bitmask of format flags; only F_ALT is looked at
14150 * prec minimum number of digits; 0-fill on left if needed
14151 * type a character in [duoxX]; u acts the same as d
14152 *
14153 * CAUTION: o, x and X conversions on regular ints can never
14154 * produce a '-' sign, but can for Python's unbounded ints.
14155 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014156PyObject *
14157_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014158{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014159 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014160 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014161 Py_ssize_t i;
14162 int sign; /* 1 if '-', else 0 */
14163 int len; /* number of characters */
14164 Py_ssize_t llen;
14165 int numdigits; /* len == numnondigits + numdigits */
14166 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014167
Victor Stinnerd0880d52012-04-27 23:40:13 +020014168 /* Avoid exceeding SSIZE_T_MAX */
14169 if (prec > INT_MAX-3) {
14170 PyErr_SetString(PyExc_OverflowError,
14171 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014172 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014173 }
14174
14175 assert(PyLong_Check(val));
14176
14177 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014178 default:
14179 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014180 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014181 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014182 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014183 /* int and int subclasses should print numerically when a numeric */
14184 /* format code is used (see issue18780) */
14185 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014186 break;
14187 case 'o':
14188 numnondigits = 2;
14189 result = PyNumber_ToBase(val, 8);
14190 break;
14191 case 'x':
14192 case 'X':
14193 numnondigits = 2;
14194 result = PyNumber_ToBase(val, 16);
14195 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014196 }
14197 if (!result)
14198 return NULL;
14199
14200 assert(unicode_modifiable(result));
14201 assert(PyUnicode_IS_READY(result));
14202 assert(PyUnicode_IS_ASCII(result));
14203
14204 /* To modify the string in-place, there can only be one reference. */
14205 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014206 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014207 PyErr_BadInternalCall();
14208 return NULL;
14209 }
14210 buf = PyUnicode_DATA(result);
14211 llen = PyUnicode_GET_LENGTH(result);
14212 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014213 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014214 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014215 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014216 return NULL;
14217 }
14218 len = (int)llen;
14219 sign = buf[0] == '-';
14220 numnondigits += sign;
14221 numdigits = len - numnondigits;
14222 assert(numdigits > 0);
14223
14224 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014225 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014226 (type == 'o' || type == 'x' || type == 'X'))) {
14227 assert(buf[sign] == '0');
14228 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14229 buf[sign+1] == 'o');
14230 numnondigits -= 2;
14231 buf += 2;
14232 len -= 2;
14233 if (sign)
14234 buf[0] = '-';
14235 assert(len == numnondigits + numdigits);
14236 assert(numdigits > 0);
14237 }
14238
14239 /* Fill with leading zeroes to meet minimum width. */
14240 if (prec > numdigits) {
14241 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14242 numnondigits + prec);
14243 char *b1;
14244 if (!r1) {
14245 Py_DECREF(result);
14246 return NULL;
14247 }
14248 b1 = PyBytes_AS_STRING(r1);
14249 for (i = 0; i < numnondigits; ++i)
14250 *b1++ = *buf++;
14251 for (i = 0; i < prec - numdigits; i++)
14252 *b1++ = '0';
14253 for (i = 0; i < numdigits; i++)
14254 *b1++ = *buf++;
14255 *b1 = '\0';
14256 Py_DECREF(result);
14257 result = r1;
14258 buf = PyBytes_AS_STRING(result);
14259 len = numnondigits + prec;
14260 }
14261
14262 /* Fix up case for hex conversions. */
14263 if (type == 'X') {
14264 /* Need to convert all lower case letters to upper case.
14265 and need to convert 0x to 0X (and -0x to -0X). */
14266 for (i = 0; i < len; i++)
14267 if (buf[i] >= 'a' && buf[i] <= 'x')
14268 buf[i] -= 'a'-'A';
14269 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014270 if (!PyUnicode_Check(result)
14271 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014272 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014273 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014274 Py_DECREF(result);
14275 result = unicode;
14276 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014277 else if (len != PyUnicode_GET_LENGTH(result)) {
14278 if (PyUnicode_Resize(&result, len) < 0)
14279 Py_CLEAR(result);
14280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014281 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014282}
14283
Ethan Furmandf3ed242014-01-05 06:50:30 -080014284/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014285 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014286 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014287 * -1 and raise an exception on error */
14288static int
Victor Stinnera47082312012-10-04 02:19:54 +020014289mainformatlong(PyObject *v,
14290 struct unicode_format_arg_t *arg,
14291 PyObject **p_output,
14292 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014293{
14294 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014295 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014296
14297 if (!PyNumber_Check(v))
14298 goto wrongtype;
14299
Ethan Furman9ab74802014-03-21 06:38:46 -070014300 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014301 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014302 if (type == 'o' || type == 'x' || type == 'X') {
14303 iobj = PyNumber_Index(v);
14304 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014305 if (PyErr_ExceptionMatches(PyExc_TypeError))
14306 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014307 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014308 }
14309 }
14310 else {
14311 iobj = PyNumber_Long(v);
14312 if (iobj == NULL ) {
14313 if (PyErr_ExceptionMatches(PyExc_TypeError))
14314 goto wrongtype;
14315 return -1;
14316 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014317 }
14318 assert(PyLong_Check(iobj));
14319 }
14320 else {
14321 iobj = v;
14322 Py_INCREF(iobj);
14323 }
14324
14325 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014326 && arg->width == -1 && arg->prec == -1
14327 && !(arg->flags & (F_SIGN | F_BLANK))
14328 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014329 {
14330 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014331 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014332 int base;
14333
Victor Stinnera47082312012-10-04 02:19:54 +020014334 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014335 {
14336 default:
14337 assert(0 && "'type' not in [diuoxX]");
14338 case 'd':
14339 case 'i':
14340 case 'u':
14341 base = 10;
14342 break;
14343 case 'o':
14344 base = 8;
14345 break;
14346 case 'x':
14347 case 'X':
14348 base = 16;
14349 break;
14350 }
14351
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014352 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14353 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014354 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014355 }
14356 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014357 return 1;
14358 }
14359
Ethan Furmanb95b5612015-01-23 20:05:18 -080014360 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014361 Py_DECREF(iobj);
14362 if (res == NULL)
14363 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014364 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014365 return 0;
14366
14367wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014368 switch(type)
14369 {
14370 case 'o':
14371 case 'x':
14372 case 'X':
14373 PyErr_Format(PyExc_TypeError,
14374 "%%%c format: an integer is required, "
14375 "not %.200s",
14376 type, Py_TYPE(v)->tp_name);
14377 break;
14378 default:
14379 PyErr_Format(PyExc_TypeError,
14380 "%%%c format: a number is required, "
14381 "not %.200s",
14382 type, Py_TYPE(v)->tp_name);
14383 break;
14384 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014385 return -1;
14386}
14387
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014388static Py_UCS4
14389formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014390{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014391 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014392 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014393 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014394 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014395 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014396 goto onError;
14397 }
14398 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014399 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014400 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014401 /* make sure number is a type of integer */
14402 if (!PyLong_Check(v)) {
14403 iobj = PyNumber_Index(v);
14404 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014405 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014406 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014407 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014408 Py_DECREF(iobj);
14409 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014410 else {
14411 x = PyLong_AsLong(v);
14412 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014413 if (x == -1 && PyErr_Occurred())
14414 goto onError;
14415
Victor Stinner8faf8212011-12-08 22:14:11 +010014416 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 PyErr_SetString(PyExc_OverflowError,
14418 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014419 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014420 }
14421
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014422 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014423 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014424
Benjamin Peterson29060642009-01-31 22:14:21 +000014425 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014426 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014427 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014428 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014429}
14430
Victor Stinnera47082312012-10-04 02:19:54 +020014431/* Parse options of an argument: flags, width, precision.
14432 Handle also "%(name)" syntax.
14433
14434 Return 0 if the argument has been formatted into arg->str.
14435 Return 1 if the argument has been written into ctx->writer,
14436 Raise an exception and return -1 on error. */
14437static int
14438unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14439 struct unicode_format_arg_t *arg)
14440{
14441#define FORMAT_READ(ctx) \
14442 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14443
14444 PyObject *v;
14445
Victor Stinnera47082312012-10-04 02:19:54 +020014446 if (arg->ch == '(') {
14447 /* Get argument value from a dictionary. Example: "%(name)s". */
14448 Py_ssize_t keystart;
14449 Py_ssize_t keylen;
14450 PyObject *key;
14451 int pcount = 1;
14452
14453 if (ctx->dict == NULL) {
14454 PyErr_SetString(PyExc_TypeError,
14455 "format requires a mapping");
14456 return -1;
14457 }
14458 ++ctx->fmtpos;
14459 --ctx->fmtcnt;
14460 keystart = ctx->fmtpos;
14461 /* Skip over balanced parentheses */
14462 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14463 arg->ch = FORMAT_READ(ctx);
14464 if (arg->ch == ')')
14465 --pcount;
14466 else if (arg->ch == '(')
14467 ++pcount;
14468 ctx->fmtpos++;
14469 }
14470 keylen = ctx->fmtpos - keystart - 1;
14471 if (ctx->fmtcnt < 0 || pcount > 0) {
14472 PyErr_SetString(PyExc_ValueError,
14473 "incomplete format key");
14474 return -1;
14475 }
14476 key = PyUnicode_Substring(ctx->fmtstr,
14477 keystart, keystart + keylen);
14478 if (key == NULL)
14479 return -1;
14480 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014481 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014482 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014483 }
14484 ctx->args = PyObject_GetItem(ctx->dict, key);
14485 Py_DECREF(key);
14486 if (ctx->args == NULL)
14487 return -1;
14488 ctx->args_owned = 1;
14489 ctx->arglen = -1;
14490 ctx->argidx = -2;
14491 }
14492
14493 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014494 while (--ctx->fmtcnt >= 0) {
14495 arg->ch = FORMAT_READ(ctx);
14496 ctx->fmtpos++;
14497 switch (arg->ch) {
14498 case '-': arg->flags |= F_LJUST; continue;
14499 case '+': arg->flags |= F_SIGN; continue;
14500 case ' ': arg->flags |= F_BLANK; continue;
14501 case '#': arg->flags |= F_ALT; continue;
14502 case '0': arg->flags |= F_ZERO; continue;
14503 }
14504 break;
14505 }
14506
14507 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014508 if (arg->ch == '*') {
14509 v = unicode_format_getnextarg(ctx);
14510 if (v == NULL)
14511 return -1;
14512 if (!PyLong_Check(v)) {
14513 PyErr_SetString(PyExc_TypeError,
14514 "* wants int");
14515 return -1;
14516 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014517 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014518 if (arg->width == -1 && PyErr_Occurred())
14519 return -1;
14520 if (arg->width < 0) {
14521 arg->flags |= F_LJUST;
14522 arg->width = -arg->width;
14523 }
14524 if (--ctx->fmtcnt >= 0) {
14525 arg->ch = FORMAT_READ(ctx);
14526 ctx->fmtpos++;
14527 }
14528 }
14529 else if (arg->ch >= '0' && arg->ch <= '9') {
14530 arg->width = arg->ch - '0';
14531 while (--ctx->fmtcnt >= 0) {
14532 arg->ch = FORMAT_READ(ctx);
14533 ctx->fmtpos++;
14534 if (arg->ch < '0' || arg->ch > '9')
14535 break;
14536 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14537 mixing signed and unsigned comparison. Since arg->ch is between
14538 '0' and '9', casting to int is safe. */
14539 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14540 PyErr_SetString(PyExc_ValueError,
14541 "width too big");
14542 return -1;
14543 }
14544 arg->width = arg->width*10 + (arg->ch - '0');
14545 }
14546 }
14547
14548 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014549 if (arg->ch == '.') {
14550 arg->prec = 0;
14551 if (--ctx->fmtcnt >= 0) {
14552 arg->ch = FORMAT_READ(ctx);
14553 ctx->fmtpos++;
14554 }
14555 if (arg->ch == '*') {
14556 v = unicode_format_getnextarg(ctx);
14557 if (v == NULL)
14558 return -1;
14559 if (!PyLong_Check(v)) {
14560 PyErr_SetString(PyExc_TypeError,
14561 "* wants int");
14562 return -1;
14563 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014564 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014565 if (arg->prec == -1 && PyErr_Occurred())
14566 return -1;
14567 if (arg->prec < 0)
14568 arg->prec = 0;
14569 if (--ctx->fmtcnt >= 0) {
14570 arg->ch = FORMAT_READ(ctx);
14571 ctx->fmtpos++;
14572 }
14573 }
14574 else if (arg->ch >= '0' && arg->ch <= '9') {
14575 arg->prec = arg->ch - '0';
14576 while (--ctx->fmtcnt >= 0) {
14577 arg->ch = FORMAT_READ(ctx);
14578 ctx->fmtpos++;
14579 if (arg->ch < '0' || arg->ch > '9')
14580 break;
14581 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14582 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014583 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014584 return -1;
14585 }
14586 arg->prec = arg->prec*10 + (arg->ch - '0');
14587 }
14588 }
14589 }
14590
14591 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14592 if (ctx->fmtcnt >= 0) {
14593 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14594 if (--ctx->fmtcnt >= 0) {
14595 arg->ch = FORMAT_READ(ctx);
14596 ctx->fmtpos++;
14597 }
14598 }
14599 }
14600 if (ctx->fmtcnt < 0) {
14601 PyErr_SetString(PyExc_ValueError,
14602 "incomplete format");
14603 return -1;
14604 }
14605 return 0;
14606
14607#undef FORMAT_READ
14608}
14609
14610/* Format one argument. Supported conversion specifiers:
14611
14612 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014613 - "i", "d", "u": int or float
14614 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014615 - "e", "E", "f", "F", "g", "G": float
14616 - "c": int or str (1 character)
14617
Victor Stinner8dbd4212012-12-04 09:30:24 +010014618 When possible, the output is written directly into the Unicode writer
14619 (ctx->writer). A string is created when padding is required.
14620
Victor Stinnera47082312012-10-04 02:19:54 +020014621 Return 0 if the argument has been formatted into *p_str,
14622 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014623 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014624static int
14625unicode_format_arg_format(struct unicode_formatter_t *ctx,
14626 struct unicode_format_arg_t *arg,
14627 PyObject **p_str)
14628{
14629 PyObject *v;
14630 _PyUnicodeWriter *writer = &ctx->writer;
14631
14632 if (ctx->fmtcnt == 0)
14633 ctx->writer.overallocate = 0;
14634
Victor Stinnera47082312012-10-04 02:19:54 +020014635 v = unicode_format_getnextarg(ctx);
14636 if (v == NULL)
14637 return -1;
14638
Victor Stinnera47082312012-10-04 02:19:54 +020014639
14640 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014641 case 's':
14642 case 'r':
14643 case 'a':
14644 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14645 /* Fast path */
14646 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14647 return -1;
14648 return 1;
14649 }
14650
14651 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14652 *p_str = v;
14653 Py_INCREF(*p_str);
14654 }
14655 else {
14656 if (arg->ch == 's')
14657 *p_str = PyObject_Str(v);
14658 else if (arg->ch == 'r')
14659 *p_str = PyObject_Repr(v);
14660 else
14661 *p_str = PyObject_ASCII(v);
14662 }
14663 break;
14664
14665 case 'i':
14666 case 'd':
14667 case 'u':
14668 case 'o':
14669 case 'x':
14670 case 'X':
14671 {
14672 int ret = mainformatlong(v, arg, p_str, writer);
14673 if (ret != 0)
14674 return ret;
14675 arg->sign = 1;
14676 break;
14677 }
14678
14679 case 'e':
14680 case 'E':
14681 case 'f':
14682 case 'F':
14683 case 'g':
14684 case 'G':
14685 if (arg->width == -1 && arg->prec == -1
14686 && !(arg->flags & (F_SIGN | F_BLANK)))
14687 {
14688 /* Fast path */
14689 if (formatfloat(v, arg, NULL, writer) == -1)
14690 return -1;
14691 return 1;
14692 }
14693
14694 arg->sign = 1;
14695 if (formatfloat(v, arg, p_str, NULL) == -1)
14696 return -1;
14697 break;
14698
14699 case 'c':
14700 {
14701 Py_UCS4 ch = formatchar(v);
14702 if (ch == (Py_UCS4) -1)
14703 return -1;
14704 if (arg->width == -1 && arg->prec == -1) {
14705 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014706 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014707 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014708 return 1;
14709 }
14710 *p_str = PyUnicode_FromOrdinal(ch);
14711 break;
14712 }
14713
14714 default:
14715 PyErr_Format(PyExc_ValueError,
14716 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014717 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014718 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14719 (int)arg->ch,
14720 ctx->fmtpos - 1);
14721 return -1;
14722 }
14723 if (*p_str == NULL)
14724 return -1;
14725 assert (PyUnicode_Check(*p_str));
14726 return 0;
14727}
14728
/* Write the already formatted string str into ctx->writer, applying the
   width, precision, sign and fill options stored in arg.

   The fill character is '0' only when arg->sign is set and F_ZERO was
   given; otherwise it is a space.  Right padding always uses spaces.
   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success.  Return -1 if PyUnicode_READY(),
   _PyUnicodeWriter_WriteStr() or _PyUnicodeWriter_Prepare() fails. */
14729static int
14730unicode_format_arg_output(struct unicode_formatter_t *ctx,
14731 struct unicode_format_arg_t *arg,
14732 PyObject *str)
14733{
14734 Py_ssize_t len;
14735 enum PyUnicode_Kind kind;
14736 void *pbuf;
14737 Py_ssize_t pindex;
14738 Py_UCS4 signchar;
14739 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014740 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014741 Py_ssize_t sublen;
14742 _PyUnicodeWriter *writer = &ctx->writer;
14743 Py_UCS4 fill;
14744
14745 fill = ' ';
14746 if (arg->sign && arg->flags & F_ZERO)
14747 fill = '0';
14748
14749 if (PyUnicode_READY(str) == -1)
14750 return -1;
14751
14752 len = PyUnicode_GET_LENGTH(str);
/* No padding, no truncation and no forced sign: copy str unchanged */
14753 if ((arg->width == -1 || arg->width <= len)
14754 && (arg->prec == -1 || arg->prec >= len)
14755 && !(arg->flags & (F_SIGN | F_BLANK)))
14756 {
14757 /* Fast path */
14758 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14759 return -1;
14760 return 0;
14761 }
14762
14763 /* Truncate the string for "s", "r" and "a" formats
14764 if the precision is set */
14765 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14766 if (arg->prec >= 0 && len > arg->prec)
14767 len = arg->prec;
14768 }
14769
14770 /* Adjust sign and width */
14771 kind = PyUnicode_KIND(str);
14772 pbuf = PyUnicode_DATA(str);
14773 pindex = 0;
14774 signchar = '\0';
14775 if (arg->sign) {
14776 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
/* A sign already present in str is taken out of the body (len, pindex)
   so that it can be written separately from the padding. */
14777 if (ch == '-' || ch == '+') {
14778 signchar = ch;
14779 len--;
14780 pindex++;
14781 }
14782 else if (arg->flags & F_SIGN)
14783 signchar = '+';
14784 else if (arg->flags & F_BLANK)
14785 signchar = ' ';
14786 else
/* no sign character has to be written at all */
14787 arg->sign = 0;
14788 }
14789 if (arg->width < len)
14790 arg->width = len;
14791
14792 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014793 maxchar = writer->maxchar;
/* The fill character only matters for maxchar if left padding will
   actually be written. */
Victor Stinnera47082312012-10-04 02:19:54 +020014794 if (!(arg->flags & F_LJUST)) {
14795 if (arg->sign) {
14796 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014797 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014798 }
14799 else {
14800 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014801 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014802 }
14803 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014804 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14805 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014806 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014807 }
14808
Victor Stinnera47082312012-10-04 02:19:54 +020014809 buflen = arg->width;
/* The body already fills the whole width: one more cell is needed for
   the sign character. */
14810 if (arg->sign && len == arg->width)
14811 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014812 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014813 return -1;
14814
14815 /* Write the sign if needed */
14816 if (arg->sign) {
/* With a non-space fill the sign goes before the padding; with space
   fill it is written after the padding, further below. */
14817 if (fill != ' ') {
14818 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14819 writer->pos += 1;
14820 }
14821 if (arg->width > len)
14822 arg->width--;
14823 }
14824
14825 /* Write the numeric prefix for "x", "X" and "o" formats
14826 if the alternate form is used.
14827 For example, write "0x" for the "%#x" format. */
14828 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14829 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14830 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
/* Same ordering rule as for the sign: the prefix is emitted here only
   for a non-space fill. */
14831 if (fill != ' ') {
14832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14833 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14834 writer->pos += 2;
14835 pindex += 2;
14836 }
/* The two prefix characters are no longer counted in width and len */
14837 arg->width -= 2;
14838 if (arg->width < 0)
14839 arg->width = 0;
14840 len -= 2;
14841 }
14842
14843 /* Pad left with the fill character if needed */
14844 if (arg->width > len && !(arg->flags & F_LJUST)) {
14845 sublen = arg->width - len;
14846 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14847 writer->pos += sublen;
14848 arg->width = len;
14849 }
14850
14851 /* If padding with spaces: write sign if needed and/or numeric prefix if
14852 the alternate form is used */
14853 if (fill == ' ') {
14854 if (arg->sign) {
14855 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14856 writer->pos += 1;
14857 }
14858 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14859 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14860 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14861 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14862 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14863 writer->pos += 2;
14864 pindex += 2;
14865 }
14866 }
14867
14868 /* Write characters */
14869 if (len) {
14870 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14871 str, pindex, len);
14872 writer->pos += len;
14873 }
14874
14875 /* Pad right with the fill character if needed */
14876 if (arg->width > len) {
14877 sublen = arg->width - len;
14878 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14879 writer->pos += sublen;
14880 }
14881 return 0;
14882}
14883
14884/* Helper of PyUnicode_Format(): format one arg.
14885 Return 0 on success, raise an exception and return -1 on error. */
14886static int
14887unicode_format_arg(struct unicode_formatter_t *ctx)
14888{
14889 struct unicode_format_arg_t arg;
14890 PyObject *str;
14891 int ret;
14892
Victor Stinner8dbd4212012-12-04 09:30:24 +010014893 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014894 if (arg.ch == '%') {
14895 ctx->fmtpos++;
14896 ctx->fmtcnt--;
14897 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14898 return -1;
14899 return 0;
14900 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014901 arg.flags = 0;
14902 arg.width = -1;
14903 arg.prec = -1;
14904 arg.sign = 0;
14905 str = NULL;
14906
Victor Stinnera47082312012-10-04 02:19:54 +020014907 ret = unicode_format_arg_parse(ctx, &arg);
14908 if (ret == -1)
14909 return -1;
14910
14911 ret = unicode_format_arg_format(ctx, &arg, &str);
14912 if (ret == -1)
14913 return -1;
14914
14915 if (ret != 1) {
14916 ret = unicode_format_arg_output(ctx, &arg, str);
14917 Py_DECREF(str);
14918 if (ret == -1)
14919 return -1;
14920 }
14921
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014922 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014923 PyErr_SetString(PyExc_TypeError,
14924 "not all arguments converted during string formatting");
14925 return -1;
14926 }
14927 return 0;
14928}
14929
Alexander Belopolsky40018472011-02-26 01:02:56 +000014930PyObject *
14931PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014932{
Victor Stinnera47082312012-10-04 02:19:54 +020014933 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014934
Guido van Rossumd57fd912000-03-10 22:53:23 +000014935 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014936 PyErr_BadInternalCall();
14937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014938 }
Victor Stinnera47082312012-10-04 02:19:54 +020014939
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014940 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014941 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014942
14943 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014944 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14945 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14946 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14947 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014948
Victor Stinner8f674cc2013-04-17 23:02:17 +020014949 _PyUnicodeWriter_Init(&ctx.writer);
14950 ctx.writer.min_length = ctx.fmtcnt + 100;
14951 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014952
Guido van Rossumd57fd912000-03-10 22:53:23 +000014953 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014954 ctx.arglen = PyTuple_Size(args);
14955 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014956 }
14957 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014958 ctx.arglen = -1;
14959 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014960 }
Victor Stinnera47082312012-10-04 02:19:54 +020014961 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014962 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014963 ctx.dict = args;
14964 else
14965 ctx.dict = NULL;
14966 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014967
Victor Stinnera47082312012-10-04 02:19:54 +020014968 while (--ctx.fmtcnt >= 0) {
14969 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014970 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014971
14972 nonfmtpos = ctx.fmtpos++;
14973 while (ctx.fmtcnt >= 0 &&
14974 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14975 ctx.fmtpos++;
14976 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014977 }
Victor Stinnera47082312012-10-04 02:19:54 +020014978 if (ctx.fmtcnt < 0) {
14979 ctx.fmtpos--;
14980 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014981 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014982
Victor Stinnercfc4c132013-04-03 01:48:39 +020014983 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14984 nonfmtpos, ctx.fmtpos) < 0)
14985 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014986 }
14987 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014988 ctx.fmtpos++;
14989 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014990 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014991 }
14992 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014993
Victor Stinnera47082312012-10-04 02:19:54 +020014994 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014995 PyErr_SetString(PyExc_TypeError,
14996 "not all arguments converted during string formatting");
14997 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014998 }
14999
Victor Stinnera47082312012-10-04 02:19:54 +020015000 if (ctx.args_owned) {
15001 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015002 }
Victor Stinnera47082312012-10-04 02:19:54 +020015003 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015004
Benjamin Peterson29060642009-01-31 22:14:21 +000015005 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015006 _PyUnicodeWriter_Dealloc(&ctx.writer);
15007 if (ctx.args_owned) {
15008 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015009 }
15010 return NULL;
15011}
15012
Jeremy Hylton938ace62002-07-17 16:30:39 +000015013static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015014unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15015
Tim Peters6d6c1a32001-08-02 04:15:00 +000015016static PyObject *
15017unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15018{
Benjamin Peterson29060642009-01-31 22:14:21 +000015019 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015020 static char *kwlist[] = {"object", "encoding", "errors", 0};
15021 char *encoding = NULL;
15022 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015023
Benjamin Peterson14339b62009-01-31 16:36:08 +000015024 if (type != &PyUnicode_Type)
15025 return unicode_subtype_new(type, args, kwds);
15026 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015027 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015028 return NULL;
15029 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015030 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015031 if (encoding == NULL && errors == NULL)
15032 return PyObject_Str(x);
15033 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015034 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015035}
15036
Guido van Rossume023fe02001-08-30 03:12:59 +000015037static PyObject *
15038unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15039{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015040 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015041 Py_ssize_t length, char_size;
15042 int share_wstr, share_utf8;
15043 unsigned int kind;
15044 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015045
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015047
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015048 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015049 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015050 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015051 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015052 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015053 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015054 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015055 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015056
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015057 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015058 if (self == NULL) {
15059 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 return NULL;
15061 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 kind = PyUnicode_KIND(unicode);
15063 length = PyUnicode_GET_LENGTH(unicode);
15064
15065 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015066#ifdef Py_DEBUG
15067 _PyUnicode_HASH(self) = -1;
15068#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015069 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015070#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015071 _PyUnicode_STATE(self).interned = 0;
15072 _PyUnicode_STATE(self).kind = kind;
15073 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015074 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075 _PyUnicode_STATE(self).ready = 1;
15076 _PyUnicode_WSTR(self) = NULL;
15077 _PyUnicode_UTF8_LENGTH(self) = 0;
15078 _PyUnicode_UTF8(self) = NULL;
15079 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015080 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015081
15082 share_utf8 = 0;
15083 share_wstr = 0;
15084 if (kind == PyUnicode_1BYTE_KIND) {
15085 char_size = 1;
15086 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15087 share_utf8 = 1;
15088 }
15089 else if (kind == PyUnicode_2BYTE_KIND) {
15090 char_size = 2;
15091 if (sizeof(wchar_t) == 2)
15092 share_wstr = 1;
15093 }
15094 else {
15095 assert(kind == PyUnicode_4BYTE_KIND);
15096 char_size = 4;
15097 if (sizeof(wchar_t) == 4)
15098 share_wstr = 1;
15099 }
15100
15101 /* Ensure we won't overflow the length. */
15102 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15103 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015104 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015106 data = PyObject_MALLOC((length + 1) * char_size);
15107 if (data == NULL) {
15108 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015109 goto onError;
15110 }
15111
Victor Stinnerc3c74152011-10-02 20:39:55 +020015112 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015113 if (share_utf8) {
15114 _PyUnicode_UTF8_LENGTH(self) = length;
15115 _PyUnicode_UTF8(self) = data;
15116 }
15117 if (share_wstr) {
15118 _PyUnicode_WSTR_LENGTH(self) = length;
15119 _PyUnicode_WSTR(self) = (wchar_t *)data;
15120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015121
Christian Heimesf051e432016-09-13 20:22:02 +020015122 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015123 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015124 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015125#ifdef Py_DEBUG
15126 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15127#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015128 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015129 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015130
15131onError:
15132 Py_DECREF(unicode);
15133 Py_DECREF(self);
15134 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015135}
15136
/* Docstring of the str type; referenced by the tp_doc slot of
   PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015137PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015138"str(object='') -> str\n\
15139str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015140\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015141Create a new string object from the given object. If encoding or\n\
15142errors is specified, then the object must expose a data buffer\n\
15143that will be decoded using the given encoding and error handler.\n\
15144Otherwise, returns the result of object.__str__() (if defined)\n\
15145or repr(object).\n\
15146encoding defaults to sys.getdefaultencoding().\n\
15147errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015148
/* Forward declaration: unicode_iter is referenced by the tp_iter slot
   below. */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015149static PyObject *unicode_iter(PyObject *seq);
15150
/* Type object of str.

   The slots are filled positionally; each entry is labelled with the
   name of the slot it initializes.  The type can be subclassed
   (Py_TPFLAGS_BASETYPE), has PyBaseObject_Type as its base, is created
   through unicode_new and released with PyObject_Del. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015151PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015152 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015153 "str", /* tp_name */
15154 sizeof(PyUnicodeObject), /* tp_basicsize */
15155 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015156 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015157 (destructor)unicode_dealloc, /* tp_dealloc */
15158 0, /* tp_print */
15159 0, /* tp_getattr */
15160 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015161 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015162 unicode_repr, /* tp_repr */
15163 &unicode_as_number, /* tp_as_number */
15164 &unicode_as_sequence, /* tp_as_sequence */
15165 &unicode_as_mapping, /* tp_as_mapping */
15166 (hashfunc) unicode_hash, /* tp_hash*/
15167 0, /* tp_call*/
15168 (reprfunc) unicode_str, /* tp_str */
15169 PyObject_GenericGetAttr, /* tp_getattro */
15170 0, /* tp_setattro */
15171 0, /* tp_as_buffer */
15172 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015173 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015174 unicode_doc, /* tp_doc */
15175 0, /* tp_traverse */
15176 0, /* tp_clear */
15177 PyUnicode_RichCompare, /* tp_richcompare */
15178 0, /* tp_weaklistoffset */
15179 unicode_iter, /* tp_iter */
15180 0, /* tp_iternext */
15181 unicode_methods, /* tp_methods */
15182 0, /* tp_members */
15183 0, /* tp_getset */
15184 &PyBaseObject_Type, /* tp_base */
15185 0, /* tp_dict */
15186 0, /* tp_descr_get */
15187 0, /* tp_descr_set */
15188 0, /* tp_dictoffset */
15189 0, /* tp_init */
15190 0, /* tp_alloc */
15191 unicode_new, /* tp_new */
15192 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015193};
15194
15195/* Initialize the Unicode implementation */
15196
Victor Stinner3a50e702011-10-18 21:21:00 +020015197int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015198{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015199 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015200 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015201 0x000A, /* LINE FEED */
15202 0x000D, /* CARRIAGE RETURN */
15203 0x001C, /* FILE SEPARATOR */
15204 0x001D, /* GROUP SEPARATOR */
15205 0x001E, /* RECORD SEPARATOR */
15206 0x0085, /* NEXT LINE */
15207 0x2028, /* LINE SEPARATOR */
15208 0x2029, /* PARAGRAPH SEPARATOR */
15209 };
15210
Fred Drakee4315f52000-05-09 19:53:39 +000015211 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015212 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015213 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015214 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015215 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015216
Guido van Rossumcacfc072002-05-24 19:01:59 +000015217 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015218 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015219
15220 /* initialize the linebreak bloom filter */
15221 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015222 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015223 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015224
Christian Heimes26532f72013-07-20 14:57:16 +020015225 if (PyType_Ready(&EncodingMapType) < 0)
15226 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015227
Benjamin Petersonc4311282012-10-30 23:21:10 -040015228 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15229 Py_FatalError("Can't initialize field name iterator type");
15230
15231 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15232 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015233
Victor Stinner3a50e702011-10-18 21:21:00 +020015234 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015235}
15236
15237/* Finalize the Unicode implementation */
15238
/* This implementation does no work here: the function has nothing to
   release and unconditionally returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
15244
Guido van Rossumd57fd912000-03-10 22:53:23 +000015245void
Thomas Wouters78890102000-07-22 19:25:51 +000015246_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015247{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015248 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015249
Serhiy Storchaka05997252013-01-26 12:14:02 +020015250 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015251
Serhiy Storchaka05997252013-01-26 12:14:02 +020015252 for (i = 0; i < 256; i++)
15253 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015254 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015255 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015256}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015257
Walter Dörwald16807132007-05-25 13:52:07 +000015258void
15259PyUnicode_InternInPlace(PyObject **p)
15260{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015261 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015262 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015263#ifdef Py_DEBUG
15264 assert(s != NULL);
15265 assert(_PyUnicode_CHECK(s));
15266#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015268 return;
15269#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 /* If it's a subclass, we don't really know what putting
15271 it in the interned dict might do. */
15272 if (!PyUnicode_CheckExact(s))
15273 return;
15274 if (PyUnicode_CHECK_INTERNED(s))
15275 return;
15276 if (interned == NULL) {
15277 interned = PyDict_New();
15278 if (interned == NULL) {
15279 PyErr_Clear(); /* Don't leave an exception */
15280 return;
15281 }
15282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015283 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015284 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015285 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015286 if (t == NULL) {
15287 PyErr_Clear();
15288 return;
15289 }
15290 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015291 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015292 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015293 return;
15294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015295 /* The two references in interned are not counted by refcnt.
15296 The deallocator will take care of this */
15297 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015298 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015299}
15300
15301void
15302PyUnicode_InternImmortal(PyObject **p)
15303{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015304 PyUnicode_InternInPlace(p);
15305 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015306 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 Py_INCREF(*p);
15308 }
Walter Dörwald16807132007-05-25 13:52:07 +000015309}
15310
15311PyObject *
15312PyUnicode_InternFromString(const char *cp)
15313{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015314 PyObject *s = PyUnicode_FromString(cp);
15315 if (s == NULL)
15316 return NULL;
15317 PyUnicode_InternInPlace(&s);
15318 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015319}
15320
Alexander Belopolsky40018472011-02-26 01:02:56 +000015321void
15322_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015323{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015324 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015325 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015326 Py_ssize_t i, n;
15327 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015328
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 if (interned == NULL || !PyDict_Check(interned))
15330 return;
15331 keys = PyDict_Keys(interned);
15332 if (keys == NULL || !PyList_Check(keys)) {
15333 PyErr_Clear();
15334 return;
15335 }
Walter Dörwald16807132007-05-25 13:52:07 +000015336
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15338 detector, interned unicode strings are not forcibly deallocated;
15339 rather, we give them their stolen references back, and then clear
15340 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015341
Benjamin Peterson14339b62009-01-31 16:36:08 +000015342 n = PyList_GET_SIZE(keys);
15343 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015344 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015346 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015347 if (PyUnicode_READY(s) == -1) {
15348 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015349 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015351 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 case SSTATE_NOT_INTERNED:
15353 /* XXX Shouldn't happen */
15354 break;
15355 case SSTATE_INTERNED_IMMORTAL:
15356 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015357 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015358 break;
15359 case SSTATE_INTERNED_MORTAL:
15360 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015361 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015362 break;
15363 default:
15364 Py_FatalError("Inconsistent interned string state.");
15365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015366 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 }
15368 fprintf(stderr, "total size of all interned strings: "
15369 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15370 "mortal/immortal\n", mortal_size, immortal_size);
15371 Py_DECREF(keys);
15372 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015373 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015374}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015375
15376
15377/********************* Unicode Iterator **************************/
15378
/* Iterator state for iterating over a str object character by character. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15384
15385static void
15386unicodeiter_dealloc(unicodeiterobject *it)
15387{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015388 _PyObject_GC_UNTRACK(it);
15389 Py_XDECREF(it->it_seq);
15390 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015391}
15392
15393static int
15394unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15395{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015396 Py_VISIT(it->it_seq);
15397 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015398}
15399
15400static PyObject *
15401unicodeiter_next(unicodeiterobject *it)
15402{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015403 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015404
Benjamin Peterson14339b62009-01-31 16:36:08 +000015405 assert(it != NULL);
15406 seq = it->it_seq;
15407 if (seq == NULL)
15408 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015409 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015411 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15412 int kind = PyUnicode_KIND(seq);
15413 void *data = PyUnicode_DATA(seq);
15414 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15415 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015416 if (item != NULL)
15417 ++it->it_index;
15418 return item;
15419 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015420
Benjamin Peterson14339b62009-01-31 16:36:08 +000015421 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015422 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015424}
15425
15426static PyObject *
15427unicodeiter_len(unicodeiterobject *it)
15428{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015429 Py_ssize_t len = 0;
15430 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015431 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015432 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015433}
15434
15435PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15436
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015437static PyObject *
15438unicodeiter_reduce(unicodeiterobject *it)
15439{
15440 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015441 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015442 it->it_seq, it->it_index);
15443 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015444 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015445 if (u == NULL)
15446 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015447 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015448 }
15449}
15450
15451PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15452
15453static PyObject *
15454unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15455{
15456 Py_ssize_t index = PyLong_AsSsize_t(state);
15457 if (index == -1 && PyErr_Occurred())
15458 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015459 if (it->it_seq != NULL) {
15460 if (index < 0)
15461 index = 0;
15462 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15463 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15464 it->it_index = index;
15465 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015466 Py_RETURN_NONE;
15467}
15468
15469PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15470
/* Method table of the str iterator type: length hint plus the two
   methods used for pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
15480
/* Type object of the iterator returned by unicode_iter().  The initializer
   is positional: entries must stay in PyTypeObject slot order. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,              /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,              /* tp_print */
    0,              /* tp_getattr */
    0,              /* tp_setattr */
    0,              /* tp_reserved */
    0,              /* tp_repr */
    0,              /* tp_as_number */
    0,              /* tp_as_sequence */
    0,              /* tp_as_mapping */
    0,              /* tp_hash */
    0,              /* tp_call */
    0,              /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,              /* tp_setattro */
    0,              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,              /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,              /* tp_clear */
    0,              /* tp_richcompare */
    0,              /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,              /* presumably tp_members (slot following tp_methods) */
};
15513
15514static PyObject *
15515unicode_iter(PyObject *seq)
15516{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015517 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015518
Benjamin Peterson14339b62009-01-31 16:36:08 +000015519 if (!PyUnicode_Check(seq)) {
15520 PyErr_BadInternalCall();
15521 return NULL;
15522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015523 if (PyUnicode_READY(seq) == -1)
15524 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015525 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15526 if (it == NULL)
15527 return NULL;
15528 it->it_index = 0;
15529 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015530 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015531 _PyObject_GC_TRACK(it);
15532 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015533}
15534
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015535
/* Return the number of Py_UNICODE characters in the NUL-terminated string
   u, not counting the terminator.  Thin wrapper around wcslen(). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const size_t length = wcslen(u);

    return length;
}
15541
/* Copy the NUL-terminated string s2, terminator included, into s1.
   s1 must be large enough; no bounds are checked.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15549
/* Copy at most n characters of the NUL-terminated string s2 into s1.

   Copying stops after the terminating NUL has been copied or after n
   characters have been written, whichever comes first.  Like strncpy(),
   the result is NOT NUL-terminated when s2 holds n or more characters;
   unlike strncpy(), the rest of s1 is not padded with NULs.  Nothing is
   written when n is 0.  Returns s1.

   The previous implementation stored each character before testing n and
   therefore wrote up to n + 1 characters (one even for n == 0). */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    while (n != 0) {
        n--;
        /* Test the budget first, then store; stop once the NUL is copied. */
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
15559
/* Append the NUL-terminated string s2, terminator included, to the end of
   the NUL-terminated string s1.  s1 must have room for the result; no
   bounds are checked.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing on top of s1's current terminator. */
    Py_UNICODE *tail = s1 + wcslen(s1);

    for (;;) {
        Py_UNICODE ch = *s2++;
        *tail++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15568
/* Compare two NUL-terminated Py_UNICODE strings.

   Returns -1, 0 or +1 depending on whether s1 sorts before, equal to or
   after s2.  A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        const Py_UNICODE a = *s1;
        const Py_UNICODE b = *s2;

        if (a == 0 || b == 0) {
            /* At least one string ended: the longer one is greater. */
            if (a)
                return 1;
            if (b)
                return -1;
            return 0;
        }
        if (a != b) {
            return (a < b) ? -1 : +1;
        }
    }
}
15582
/* Compare at most n characters of two NUL-terminated Py_UNICODE strings.

   Returns -1 or +1 at the first differing character, and 0 if the first n
   characters are equal or both strings end before a difference is found. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n != 0) {
        const Py_UNICODE a = *s1++;
        const Py_UNICODE b = *s2++;

        if (a != b) {
            return (a < b) ? -1 : +1;
        }
        if (a == '\0') {
            /* Both strings ended together. */
            return 0;
        }
        n--;
    }
    return 0;
}
15599
/* Return a pointer to the first occurrence of c in the NUL-terminated
   string s, or NULL if c does not occur.

   The terminator itself is never matched, so searching for 0 yields NULL
   (this differs from the C library's strchr()). */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor = s;

    while (*cursor != 0) {
        if (*cursor == c) {
            return (Py_UNICODE*)cursor;
        }
        cursor++;
    }
    return NULL;
}
15609
/* Return a pointer to the last occurrence of c in the NUL-terminated
   string s, or NULL if c does not occur.

   The terminator itself is never matched, so searching for 0 yields NULL
   (this differs from the C library's strrchr()). */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;
    const Py_UNICODE *cursor;

    /* Single forward pass, remembering the most recent match. */
    for (cursor = s; *cursor != 0; cursor++) {
        if (*cursor == c) {
            last = cursor;
        }
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015622
Victor Stinner71133ff2010-09-01 23:43:53 +000015623Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015624PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015625{
Victor Stinner577db2c2011-10-11 22:12:48 +020015626 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015627 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015629 if (!PyUnicode_Check(unicode)) {
15630 PyErr_BadArgument();
15631 return NULL;
15632 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015633 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015634 if (u == NULL)
15635 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015636 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015637 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015638 PyErr_NoMemory();
15639 return NULL;
15640 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015641 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015642 size *= sizeof(Py_UNICODE);
15643 copy = PyMem_Malloc(size);
15644 if (copy == NULL) {
15645 PyErr_NoMemory();
15646 return NULL;
15647 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015648 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015649 return copy;
15650}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015651
Georg Brandl66c221e2010-10-14 07:04:07 +000015652/* A _string module, to export formatter_parser and formatter_field_name_split
15653 to the string.Formatter class implemented in Python. */
15654
/* Functions exposed by the _string module; both take a single argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15662
/* Module definition for _string.  The initializer is positional, so the
   entries must stay in PyModuleDef field order. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,
    _string_methods,                    /* module-level functions */
    NULL,
    NULL,
    NULL,
    NULL
};
15674
15675PyMODINIT_FUNC
15676PyInit__string(void)
15677{
15678 return PyModule_Create(&_string_module);
15679}
15680
15681
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015682#ifdef __cplusplus
15683}
15684#endif