/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090052class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
55
56/*[python input]
57class Py_UCS4_converter(CConverter):
58 type = 'Py_UCS4'
59 converter = 'convert_uc'
60
61 def converter_init(self):
62 if self.default is not unspecified:
63 self.c_default = ascii(self.default)
64 if len(self.c_default) > 4 or self.c_default[0] != "'":
65 self.c_default = hex(ord(self.default))
66
67[python start generated code]*/
68/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080069
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000070/* --- Globals ------------------------------------------------------------
71
Serhiy Storchaka05997252013-01-26 12:14:02 +020072NOTE: In the interpreter's initialization phase, some globals are currently
73 initialized dynamically as needed. In the process Unicode objects may
74 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000075
76*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000078
79#ifdef __cplusplus
80extern "C" {
81#endif
82
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used inside the assertions of the accessor macros below.
   With Py_DEBUG it calls _PyUnicode_CheckConsistency() with
   check_content == 0; otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Field accessors for the string object structs.

   The macros without an assert() are bare struct-member expressions: they
   perform no checking and can be used as assignment targets.  The macros
   that start with assert(...) check `op` first (when assertions are
   enabled) and are meant for reading only. */

/* utf8 member of the PyCompactUnicodeObject header (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 data of a ready string.  For a compact ASCII string this is the
   memory directly following the PyASCIIObject header; otherwise it is the
   stored utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of the PyCompactUnicodeObject header (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 data of a ready string.  For a compact ASCII string
   the `length` member is used; otherwise the stored utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked accessors for individual struct members. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked readers for state.kind and length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of the full PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY(): same contract, but the
   type check goes through _PyUnicode_CHECK().  Evaluates to 0 when `op` is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer of `op` is the same address as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data.  The macro uses only its own argument; it previously expanded
   `_PyUnicode_WSTR(unicode)` and so depended on a caller variable that
   happened to be named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is cast to to_type individually.  The copy runs in
   groups of four elements first, then finishes the remaining 0-3 elements
   one at a time.  All macro-local names carry a leading underscore so they
   do not shadow names used at the call site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end)) {                        \
            *_to++ = (to_type) *_iter++;                \
        }                                               \
    } while (0)

/* Platform-specific overallocation factor.
   NOTE(review): the percentages below suggest the factor is used as a
   divisor (size / OVERALLOCATE_FACTOR) -- confirm at the use sites. */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif

Walter Dörwald16807132007-05-25 13:52:07 +0000188/* This dictionary holds all interned unicode strings. Note that references
189 to strings in this dictionary are *not* counted in the string's ob_refcnt.
190 When the interned string reaches a refcnt of 0 the string deallocation
191 function will delete the reference from this dictionary.
192
193 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000194 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000195*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200196static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200199static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200200
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL) {                    \
            Py_INCREF(unicode_empty);                   \
        }                                               \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is whatever unicode_empty holds after
   _Py_INCREF_UNICODE_EMPTY(), i.e. NULL if its creation failed. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)

Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200220/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700221static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200222_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200224/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000227/* Single character Unicode strings in the Latin-1 range are being
228 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200229static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code: 1 for 0x09-0x0D, 0x1C-0x1F and
   0x20, 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200262/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200263static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200264static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100265static int unicode_modifiable(PyObject *unicode);
266
Victor Stinnerfe226c02011-10-03 03:52:20 +0200267
Alexander Belopolsky40018472011-02-26 01:02:56 +0000268static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100269_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200270static PyObject *
271_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272static PyObject *
273_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274
275static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000276unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000277 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100278 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000279 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280
Alexander Belopolsky40018472011-02-26 01:02:56 +0000281static void
282raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300283 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100284 PyObject *unicode,
285 Py_ssize_t startpos, Py_ssize_t endpos,
286 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000287
/* Same for linebreaks.
   128-entry table indexed by ASCII code: 1 for 0x0A-0x0D and 0x1C-0x1E,
   0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

INADA Naoki3ae20562017-01-16 20:41:20 +0900316static int convert_uc(PyObject *obj, void *addr);
317
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300318#include "clinic/unicodeobject.c.h"
319
/* Identifiers for the error handler names recognized by
   get_error_handler().  _Py_ERROR_UNKNOWN is 0; _Py_ERROR_OTHER stands for
   any name that is not in the list. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching identifier
   for the other names compared below, and _Py_ERROR_OTHER for any other
   string.  The comparison is exact (strcmp), so it is case-sensitive.
   _Py_ERROR_UNKNOWN is never returned. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        return _Py_ERROR_STRICT;
    }
    if (strcmp(errors, "surrogateescape") == 0) {
        return _Py_ERROR_SURROGATEESCAPE;
    }
    if (strcmp(errors, "replace") == 0) {
        return _Py_ERROR_REPLACE;
    }
    if (strcmp(errors, "ignore") == 0) {
        return _Py_ERROR_IGNORE;
    }
    if (strcmp(errors, "backslashreplace") == 0) {
        return _Py_ERROR_BACKSLASHREPLACE;
    }
    if (strcmp(errors, "surrogatepass") == 0) {
        return _Py_ERROR_SURROGATEPASS;
    }
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
        return _Py_ERROR_XMLCHARREFREPLACE;
    }
    return _Py_ERROR_OTHER;
}

Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300359/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000362PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000363{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000364#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000365 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000366#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000367 /* This is actually an illegal character, so it should
368 not be passed to unichr. */
369 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000370#endif
371}
372
#ifdef Py_DEBUG
/* Debug-build sanity check of a string object's internal invariants.

   op:            object to check; must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not in the legacy wchar
                  kind), also scan the characters to verify that the
                  narrowest possible kind is in use and that the buffer is
                  terminated by a 0 character.

   Every violated invariant trips an assert(); when all checks pass the
   function returns 1, so it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is populated */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact, ready string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character data exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing cache must have a zero cached length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the element one past the last character must be 0 */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif

Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489static PyObject*
490unicode_result_wchar(PyObject *unicode)
491{
492#ifndef Py_DEBUG
493 Py_ssize_t len;
494
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100495 len = _PyUnicode_WSTR_LENGTH(unicode);
496 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100497 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 }
500
501 if (len == 1) {
502 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100503 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100504 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505 Py_DECREF(unicode);
506 return latin1_char;
507 }
508 }
509
510 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200511 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 return NULL;
513 }
514#else
Victor Stinneraa771272012-10-04 02:32:58 +0200515 assert(Py_REFCNT(unicode) == 1);
516
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100517 /* don't make the result ready in debug mode to ensure that the caller
518 makes the string ready before using it */
519 assert(_PyUnicode_CheckConsistency(unicode, 1));
520#endif
521 return unicode;
522}
523
524static PyObject*
525unicode_result_ready(PyObject *unicode)
526{
527 Py_ssize_t length;
528
529 length = PyUnicode_GET_LENGTH(unicode);
530 if (length == 0) {
531 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100532 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200533 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 }
535 return unicode_empty;
536 }
537
538 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200539 void *data = PyUnicode_DATA(unicode);
540 int kind = PyUnicode_KIND(unicode);
541 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 if (ch < 256) {
543 PyObject *latin1_char = unicode_latin1[ch];
544 if (latin1_char != NULL) {
545 if (unicode != latin1_char) {
546 Py_INCREF(latin1_char);
547 Py_DECREF(unicode);
548 }
549 return latin1_char;
550 }
551 else {
552 assert(_PyUnicode_CheckConsistency(unicode, 1));
553 Py_INCREF(unicode);
554 unicode_latin1[ch] = unicode;
555 return unicode;
556 }
557 }
558 }
559
560 assert(_PyUnicode_CheckConsistency(unicode, 1));
561 return unicode;
562}
563
564static PyObject*
565unicode_result(PyObject *unicode)
566{
567 assert(_PyUnicode_CHECK(unicode));
568 if (PyUnicode_IS_READY(unicode))
569 return unicode_result_ready(unicode);
570 else
571 return unicode_result_wchar(unicode);
572}
573
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574static PyObject*
575unicode_result_unchanged(PyObject *unicode)
576{
577 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500578 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100579 return NULL;
580 Py_INCREF(unicode);
581 return unicode;
582 }
583 else
584 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100585 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100586}
587
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200588/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589 ASCII, Latin1, UTF-8, etc. */
590static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200591backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200592 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593{
Victor Stinnerad771582015-10-09 12:38:53 +0200594 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200595 Py_UCS4 ch;
596 enum PyUnicode_Kind kind;
597 void *data;
598
599 assert(PyUnicode_IS_READY(unicode));
600 kind = PyUnicode_KIND(unicode);
601 data = PyUnicode_DATA(unicode);
602
603 size = 0;
604 /* determine replacement size */
605 for (i = collstart; i < collend; ++i) {
606 Py_ssize_t incr;
607
608 ch = PyUnicode_READ(kind, data, i);
609 if (ch < 0x100)
610 incr = 2+2;
611 else if (ch < 0x10000)
612 incr = 2+4;
613 else {
614 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200615 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200616 }
617 if (size > PY_SSIZE_T_MAX - incr) {
618 PyErr_SetString(PyExc_OverflowError,
619 "encoded result is too long for a Python string");
620 return NULL;
621 }
622 size += incr;
623 }
624
Victor Stinnerad771582015-10-09 12:38:53 +0200625 str = _PyBytesWriter_Prepare(writer, str, size);
626 if (str == NULL)
627 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628
629 /* generate replacement */
630 for (i = collstart; i < collend; ++i) {
631 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200632 *str++ = '\\';
633 if (ch >= 0x00010000) {
634 *str++ = 'U';
635 *str++ = Py_hexdigits[(ch>>28)&0xf];
636 *str++ = Py_hexdigits[(ch>>24)&0xf];
637 *str++ = Py_hexdigits[(ch>>20)&0xf];
638 *str++ = Py_hexdigits[(ch>>16)&0xf];
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200641 }
Victor Stinner797485e2015-10-09 03:17:30 +0200642 else if (ch >= 0x100) {
643 *str++ = 'u';
644 *str++ = Py_hexdigits[(ch>>12)&0xf];
645 *str++ = Py_hexdigits[(ch>>8)&0xf];
646 }
647 else
648 *str++ = 'x';
649 *str++ = Py_hexdigits[(ch>>4)&0xf];
650 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 }
652 return str;
653}
654
655/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656 ASCII, Latin1, UTF-8, etc. */
657static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200658xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200659 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660{
Victor Stinnerad771582015-10-09 12:38:53 +0200661 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 Py_UCS4 ch;
663 enum PyUnicode_Kind kind;
664 void *data;
665
666 assert(PyUnicode_IS_READY(unicode));
667 kind = PyUnicode_KIND(unicode);
668 data = PyUnicode_DATA(unicode);
669
670 size = 0;
671 /* determine replacement size */
672 for (i = collstart; i < collend; ++i) {
673 Py_ssize_t incr;
674
675 ch = PyUnicode_READ(kind, data, i);
676 if (ch < 10)
677 incr = 2+1+1;
678 else if (ch < 100)
679 incr = 2+2+1;
680 else if (ch < 1000)
681 incr = 2+3+1;
682 else if (ch < 10000)
683 incr = 2+4+1;
684 else if (ch < 100000)
685 incr = 2+5+1;
686 else if (ch < 1000000)
687 incr = 2+6+1;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+7+1;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
Victor Stinnerad771582015-10-09 12:38:53 +0200700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707 }
708 return str;
709}
710
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711/* --- Bloom Filters ----------------------------------------------------- */
712
713/* stuff to implement simple "bloom filters" for Unicode characters.
714 to keep things simple, we use a single bitmask, using the least 5
715 bits from each unicode characters as the bit index. */
716
717/* the linebreak mask is set up by Unicode_Init below */
718
Antoine Pitrouf068f942010-01-13 14:19:12 +0000719#if LONG_BIT >= 128
720#define BLOOM_WIDTH 128
721#elif LONG_BIT >= 64
722#define BLOOM_WIDTH 64
723#elif LONG_BIT >= 32
724#define BLOOM_WIDTH 32
725#else
726#error "LONG_BIT is smaller than 32"
727#endif
728
Thomas Wouters477c8d52006-05-27 19:21:47 +0000729#define BLOOM_MASK unsigned long
730
Serhiy Storchaka05997252013-01-26 12:14:02 +0200731static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
Benjamin Peterson29060642009-01-31 22:14:21 +0000735#define BLOOM_LINEBREAK(ch) \
736 ((ch) < 128U ? ascii_linebreak[(ch)] : \
737 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700739static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741{
Victor Stinnera85af502013-04-09 21:53:54 +0200742#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743 do { \
744 TYPE *data = (TYPE *)PTR; \
745 TYPE *end = data + LEN; \
746 Py_UCS4 ch; \
747 for (; data != end; data++) { \
748 ch = *data; \
749 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750 } \
751 break; \
752 } while (0)
753
Thomas Wouters477c8d52006-05-27 19:21:47 +0000754 /* calculate simple bloom-style bitmask for a given unicode string */
755
Antoine Pitrouf068f942010-01-13 14:19:12 +0000756 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000757
758 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200759 switch (kind) {
760 case PyUnicode_1BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762 break;
763 case PyUnicode_2BYTE_KIND:
764 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765 break;
766 case PyUnicode_4BYTE_KIND:
767 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768 break;
769 default:
770 assert(0);
771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200773
774#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775}
776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300777static int
778ensure_unicode(PyObject *obj)
779{
780 if (!PyUnicode_Check(obj)) {
781 PyErr_Format(PyExc_TypeError,
782 "must be str, not %.100s",
783 Py_TYPE(obj)->tp_name);
784 return -1;
785 }
786 return PyUnicode_READY(obj);
787}
788
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200789/* Compilation of templated routines */
790
791#include "stringlib/asciilib.h"
792#include "stringlib/fastsearch.h"
793#include "stringlib/partition.h"
794#include "stringlib/split.h"
795#include "stringlib/count.h"
796#include "stringlib/find.h"
797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs1lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300807#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs2lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300818#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
823#include "stringlib/ucs4lib.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/partition.h"
826#include "stringlib/split.h"
827#include "stringlib/count.h"
828#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300829#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200830#include "stringlib/find_max_char.h"
831#include "stringlib/localeutil.h"
832#include "stringlib/undef.h"
833
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834#include "stringlib/unicodedefs.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/count.h"
837#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100838#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200839
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840/* --- Unicode Object ----------------------------------------------------- */
841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200843fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700845static inline Py_ssize_t
846findchar(const void *s, int kind,
847 Py_ssize_t size, Py_UCS4 ch,
848 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 switch (kind) {
851 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200852 if ((Py_UCS1) ch != ch)
853 return -1;
854 if (direction > 0)
855 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856 else
857 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200858 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200859 if ((Py_UCS2) ch != ch)
860 return -1;
861 if (direction > 0)
862 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863 else
864 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200866 if (direction > 0)
867 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868 else
869 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200870 default:
871 assert(0);
872 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
Victor Stinnerafffce42012-10-03 23:03:17 +0200876#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000877/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200878 earlier.
879
880 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
881 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
882 invalid character in Unicode 6.0. */
883static void
884unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
885{
886 int kind = PyUnicode_KIND(unicode);
887 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
888 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
889 if (length <= old_length)
890 return;
891 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
892}
893#endif
894
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895static PyObject*
896resize_compact(PyObject *unicode, Py_ssize_t length)
897{
898 Py_ssize_t char_size;
899 Py_ssize_t struct_size;
900 Py_ssize_t new_size;
901 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100902 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200903#ifdef Py_DEBUG
904 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905#endif
906
Victor Stinner79891572012-05-03 13:43:07 +0200907 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100909 assert(PyUnicode_IS_COMPACT(unicode));
910
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200911 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913 struct_size = sizeof(PyASCIIObject);
914 else
915 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200916 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919 PyErr_NoMemory();
920 return NULL;
921 }
922 new_size = (struct_size + (length + 1) * char_size);
923
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925 PyObject_DEL(_PyUnicode_UTF8(unicode));
926 _PyUnicode_UTF8(unicode) = NULL;
927 _PyUnicode_UTF8_LENGTH(unicode) = 0;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 _Py_DEC_REFTOTAL;
930 _Py_ForgetReference(unicode);
931
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300932 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100933 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100934 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 PyErr_NoMemory();
936 return NULL;
937 }
Victor Stinner84def372011-12-11 20:04:56 +0100938 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100944 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200945 _PyUnicode_WSTR_LENGTH(unicode) = length;
946 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100947 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_WSTR(unicode));
949 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100950 if (!PyUnicode_IS_ASCII(unicode))
951 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100952 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200953#ifdef Py_DEBUG
954 unicode_fill_invalid(unicode, old_length);
955#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200958 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 return unicode;
960}
961
Alexander Belopolsky40018472011-02-26 01:02:56 +0000962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200963resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964{
Victor Stinner95663112011-10-04 01:03:50 +0200965 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100966 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000969
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 if (PyUnicode_IS_READY(unicode)) {
971 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200972 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200974#ifdef Py_DEBUG
975 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
978 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200979 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982
983 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984 PyErr_NoMemory();
985 return -1;
986 }
987 new_size = (length + 1) * char_size;
988
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990 {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 data = (PyObject *)PyObject_REALLOC(data, new_size);
997 if (data == NULL) {
998 PyErr_NoMemory();
999 return -1;
1000 }
1001 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 _PyUnicode_WSTR_LENGTH(unicode) = length;
1005 }
1006 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001007 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 _PyUnicode_UTF8_LENGTH(unicode) = length;
1009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 _PyUnicode_LENGTH(unicode) = length;
1011 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001012#ifdef Py_DEBUG
1013 unicode_fill_invalid(unicode, old_length);
1014#endif
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001016 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinner95663112011-10-04 01:03:50 +02001020 assert(_PyUnicode_WSTR(unicode) != NULL);
1021
1022 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001023 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001024 PyErr_NoMemory();
1025 return -1;
1026 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001028 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001030 if (!wstr) {
1031 PyErr_NoMemory();
1032 return -1;
1033 }
1034 _PyUnicode_WSTR(unicode) = wstr;
1035 _PyUnicode_WSTR(unicode)[length] = 0;
1036 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001037 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return 0;
1039}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001073 Ux0000 terminated; some code (e.g. new_identifier)
1074 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1186
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   unicode.  A high surrogate immediately followed by a low surrogate is
   combined into a single code point; every other unit (including a lone
   surrogate) is copied through unchanged.

   unicode must already be a 4-byte kind string whose length equals the number
   of decoded code points (both conditions are asserted).  The terminating
   null character is not written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Valid pair: two input units produce one output code point. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst += 1;
            src += 2;
        }
        else {
            *dst = *src;
            dst += 1;
            src += 1;
        }
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
1522 assert(0);
1523 return -1;
1524 }
1525 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001526 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001527 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001528 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001529 Py_ssize_t i;
1530
Victor Stinnera0702ab2011-09-29 14:14:38 +02001531 for (i=0; i < how_many; i++) {
1532 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (ch > to_maxchar)
1534 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001535 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 }
1538 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 return 0;
1540}
1541
Victor Stinnerd3f08822012-05-29 12:57:52 +02001542void
1543_PyUnicode_FastCopyCharacters(
1544 PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546{
1547 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548}
1549
1550Py_ssize_t
1551PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many)
1554{
1555 int err;
1556
1557 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561
Benjamin Petersonbac79492012-01-14 13:34:47 -05001562 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001564 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001567 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001568 PyErr_SetString(PyExc_IndexError, "string index out of range");
1569 return -1;
1570 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001571 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001572 PyErr_SetString(PyExc_IndexError, "string index out of range");
1573 return -1;
1574 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001575 if (how_many < 0) {
1576 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577 return -1;
1578 }
1579 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001582 "Cannot write %zi characters at %zi "
1583 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001584 how_many, to_start, PyUnicode_GET_LENGTH(to));
1585 return -1;
1586 }
1587
1588 if (how_many == 0)
1589 return 0;
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 return -1;
1593
1594 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595 if (err) {
1596 PyErr_Format(PyExc_SystemError,
1597 "Cannot copy %s characters "
1598 "into a string of %s characters",
1599 unicode_kind_name(from),
1600 unicode_kind_name(to));
1601 return -1;
1602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604}
1605
Victor Stinner17222162011-09-28 22:15:37 +02001606/* Find the maximum code point and count the number of surrogate pairs so a
1607 correct string length can be computed before converting a string to UCS4.
1608 This function counts single surrogates as a character and not as a pair.
1609
1610 Return 0 on success, or -1 on error. */
1611static int
1612find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614{
1615 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001616 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617
Victor Stinnerc53be962011-10-02 21:33:54 +02001618 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 *num_surrogates = 0;
1620 *maxchar = 0;
1621
1622 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001624 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625 && (iter+1) < end
1626 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627 {
1628 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629 ++(*num_surrogates);
1630 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001634 {
1635 ch = *iter;
1636 iter++;
1637 }
1638 if (ch > *maxchar) {
1639 *maxchar = ch;
1640 if (*maxchar > MAX_UNICODE) {
1641 PyErr_Format(PyExc_ValueError,
1642 "character U+%x is not in range [U+0000; U+10ffff]",
1643 ch);
1644 return -1;
1645 }
1646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 }
1648 return 0;
1649}
1650
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001651int
1652_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653{
1654 wchar_t *end;
1655 Py_UCS4 maxchar = 0;
1656 Py_ssize_t num_surrogates;
1657#if SIZEOF_WCHAR_T == 2
1658 Py_ssize_t length_wo_surrogates;
1659#endif
1660
Georg Brandl7597add2011-10-05 16:36:47 +02001661 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001662 strings were created using _PyObject_New() and where no canonical
1663 representation (the str field) has been set yet aka strings
1664 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001665 assert(_PyUnicode_CHECK(unicode));
1666 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001668 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001669 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001670 /* Actually, it should neither be interned nor be anything else: */
1671 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001674 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677
1678 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001679 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyErr_NoMemory();
1682 return -1;
1683 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 _PyUnicode_WSTR(unicode), end,
1686 PyUnicode_1BYTE_DATA(unicode));
1687 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001691 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 }
1695 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001696 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
1700 PyObject_FREE(_PyUnicode_WSTR(unicode));
1701 _PyUnicode_WSTR(unicode) = NULL;
1702 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703 }
1704 /* In this case we might have to convert down from 4-byte native
1705 wchar_t to 2-byte unicode. */
1706 else if (maxchar < 65536) {
1707 assert(num_surrogates == 0 &&
1708 "FindMaxCharAndNumSurrogatePairs() messed up");
1709
Victor Stinner506f5922011-09-28 22:34:18 +02001710#if SIZEOF_WCHAR_T == 2
1711 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718#else
1719 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001720 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001721 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001723 PyErr_NoMemory();
1724 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 }
Victor Stinner506f5922011-09-28 22:34:18 +02001726 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727 _PyUnicode_WSTR(unicode), end,
1728 PyUnicode_2BYTE_DATA(unicode));
1729 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001732 _PyUnicode_UTF8(unicode) = NULL;
1733 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 }
1739 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740 else {
1741#if SIZEOF_WCHAR_T == 2
1742 /* in case the native representation is 2-bytes, we need to allocate a
1743 new normalized 4-byte version. */
1744 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001745 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746 PyErr_NoMemory();
1747 return -1;
1748 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001749 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 PyErr_NoMemory();
1752 return -1;
1753 }
1754 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001758 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 PyObject_FREE(_PyUnicode_WSTR(unicode));
1762 _PyUnicode_WSTR(unicode) = NULL;
1763 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764#else
1765 assert(num_surrogates == 0);
1766
Victor Stinnerc3c74152011-10-02 20:39:55 +02001767 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001769 _PyUnicode_UTF8(unicode) = NULL;
1770 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772#endif
1773 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774 }
1775 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001776 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 return 0;
1778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001781unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald16807132007-05-25 13:52:07 +00001783 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 case SSTATE_NOT_INTERNED:
1785 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001786
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 case SSTATE_INTERNED_MORTAL:
1788 /* revive dead object temporarily for DelItem */
1789 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001790 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 Py_FatalError(
1792 "deletion of interned string failed");
1793 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001794
Benjamin Peterson29060642009-01-31 22:14:21 +00001795 case SSTATE_INTERNED_IMMORTAL:
1796 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001797
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 default:
1799 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001800 }
1801
Victor Stinner03490912011-10-03 23:45:12 +02001802 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001804 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001806 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001809 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is one of the shared singletons (the
   empty string, or a cached single-character Latin-1 string), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }

    ascii = (PyASCIIObject *)unicode;
    /* Only a string with a canonical representation and exactly one
       character can be an entry of the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1828
Alexander Belopolsky40018472011-02-26 01:02:56 +00001829static int
Victor Stinner488fa492011-12-12 00:01:39 +01001830unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831{
Victor Stinner488fa492011-12-12 00:01:39 +01001832 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 if (Py_REFCNT(unicode) != 1)
1834 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001835 if (_PyUnicode_HASH(unicode) != -1)
1836 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837 if (PyUnicode_CHECK_INTERNED(unicode))
1838 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001839 if (!PyUnicode_CheckExact(unicode))
1840 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001841#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001842 /* singleton refcount is greater than 1 */
1843 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001844#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001845 return 1;
1846}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848static int
1849unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850{
1851 PyObject *unicode;
1852 Py_ssize_t old_length;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856
1857 assert(unicode != NULL);
1858 assert(PyUnicode_Check(unicode));
1859 assert(0 <= length);
1860
Victor Stinner910337b2011-10-03 03:20:16 +02001861 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 old_length = PyUnicode_WSTR_LENGTH(unicode);
1863 else
1864 old_length = PyUnicode_GET_LENGTH(unicode);
1865 if (old_length == length)
1866 return 0;
1867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001869 _Py_INCREF_UNICODE_EMPTY();
1870 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001872 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001873 return 0;
1874 }
1875
Victor Stinner488fa492011-12-12 00:01:39 +01001876 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *copy = resize_copy(unicode, length);
1878 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001880 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882 }
1883
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001885 PyObject *new_unicode = resize_compact(unicode, length);
1886 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001888 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001891 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001892}
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001896{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897 PyObject *unicode;
1898 if (p_unicode == NULL) {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001903 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001904 {
1905 PyErr_BadInternalCall();
1906 return -1;
1907 }
1908 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001911/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001913 WARNING: The function doesn't copy the terminating null character and
1914 doesn't check the maximum character (may write a latin1 character in an
1915 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001916static void
1917unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001919{
1920 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923
1924 switch (kind) {
1925 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001927#ifdef Py_DEBUG
1928 if (PyUnicode_IS_ASCII(unicode)) {
1929 Py_UCS4 maxchar = ucs1lib_find_max_char(
1930 (const Py_UCS1*)str,
1931 (const Py_UCS1*)str + len);
1932 assert(maxchar < 128);
1933 }
1934#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001935 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 case PyUnicode_2BYTE_KIND: {
1939 Py_UCS2 *start = (Py_UCS2 *)data + index;
1940 Py_UCS2 *ucs2 = start;
1941 assert(index <= PyUnicode_GET_LENGTH(unicode));
1942
Victor Stinner184252a2012-06-16 02:57:41 +02001943 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 *ucs2 = (Py_UCS2)*str;
1945
1946 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001947 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 default: {
1950 Py_UCS4 *start = (Py_UCS4 *)data + index;
1951 Py_UCS4 *ucs4 = start;
1952 assert(kind == PyUnicode_4BYTE_KIND);
1953 assert(index <= PyUnicode_GET_LENGTH(unicode));
1954
Victor Stinner184252a2012-06-16 02:57:41 +02001955 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001956 *ucs4 = (Py_UCS4)*str;
1957
1958 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001959 }
1960 }
1961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963static PyObject*
1964get_latin1_char(unsigned char ch)
1965{
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode)
1970 return NULL;
1971 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 unicode_latin1[ch] = unicode;
1974 }
1975 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001976 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977}
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979static PyObject*
1980unicode_char(Py_UCS4 ch)
1981{
1982 PyObject *unicode;
1983
1984 assert(ch <= MAX_UNICODE);
1985
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001986 if (ch < 256)
1987 return get_latin1_char(ch);
1988
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 unicode = PyUnicode_New(1, ch);
1990 if (unicode == NULL)
1991 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001992
1993 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001996 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999 }
2000 assert(_PyUnicode_CheckConsistency(unicode, 1));
2001 return unicode;
2002}
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004PyObject *
2005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002007 if (u == NULL)
2008 return (PyObject*)_PyUnicode_New(size);
2009
2010 if (size < 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 return PyUnicode_FromWideChar(u, size);
2016}
2017
2018PyObject *
2019PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 Py_UCS4 maxchar = 0;
2023 Py_ssize_t num_surrogates;
2024
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL && size != 0) {
2026 PyErr_BadInternalCall();
2027 return NULL;
2028 }
2029
2030 if (size == -1) {
2031 size = wcslen(u);
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 /* If the Unicode data is known at construction time, we can apply
2035 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002038 if (size == 0)
2039 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 /* Single character Unicode objects in the Latin-1 range are
2042 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002043 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return get_latin1_char((unsigned char)*u);
2045
2046 /* If not empty and not single character, copy the Unicode data
2047 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002048 if (find_maxchar_surrogates(u, u + size,
2049 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 return NULL;
2051
Victor Stinner8faf8212011-12-08 22:14:11 +01002052 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!unicode)
2054 return NULL;
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 switch (PyUnicode_KIND(unicode)) {
2057 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060 break;
2061 case PyUnicode_2BYTE_KIND:
2062#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002063 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002065 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067#endif
2068 break;
2069 case PyUnicode_4BYTE_KIND:
2070#if SIZEOF_WCHAR_T == 2
2071 /* This is the only case which has to process surrogates, thus
2072 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002073 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#else
2075 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002076 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077#endif
2078 break;
2079 default:
2080 assert(0 && "Impossible state");
2081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002083 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (size < 0) {
2090 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 return NULL;
2093 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002094 if (u != NULL)
2095 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096 else
2097 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002098}
2099
Alexander Belopolsky40018472011-02-26 01:02:56 +00002100PyObject *
2101PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102{
2103 size_t size = strlen(u);
2104 if (size > PY_SSIZE_T_MAX) {
2105 PyErr_SetString(PyExc_OverflowError, "input too long");
2106 return NULL;
2107 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002108 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002109}
2110
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002111PyObject *
2112_PyUnicode_FromId(_Py_Identifier *id)
2113{
2114 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002115 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116 strlen(id->string),
2117 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002118 if (!id->object)
2119 return NULL;
2120 PyUnicode_InternInPlace(&id->object);
2121 assert(!id->next);
2122 id->next = static_strings;
2123 static_strings = id;
2124 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002125 return id->object;
2126}
2127
2128void
2129_PyUnicode_ClearStaticStrings()
2130{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002131 _Py_Identifier *tmp, *s = static_strings;
2132 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002133 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002134 tmp = s->next;
2135 s->next = NULL;
2136 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002138 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139}
2140
Benjamin Peterson0df54292012-03-26 14:50:32 -04002141/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Victor Stinnerd3f08822012-05-29 12:57:52 +02002143PyObject*
2144_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002145{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002146 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002148 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002149#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002152 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002153 }
Victor Stinner785938e2011-12-11 20:09:03 +01002154 unicode = PyUnicode_New(size, 127);
2155 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002156 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002157 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002160}
2161
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002162static Py_UCS4
2163kind_maxchar_limit(unsigned int kind)
2164{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002165 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002166 case PyUnicode_1BYTE_KIND:
2167 return 0x80;
2168 case PyUnicode_2BYTE_KIND:
2169 return 0x100;
2170 case PyUnicode_4BYTE_KIND:
2171 return 0x10000;
2172 default:
2173 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002175 }
2176}
2177
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002178static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002179align_maxchar(Py_UCS4 maxchar)
2180{
2181 if (maxchar <= 127)
2182 return 127;
2183 else if (maxchar <= 255)
2184 return 255;
2185 else if (maxchar <= 65535)
2186 return 65535;
2187 else
2188 return MAX_UNICODE;
2189}
2190
Victor Stinner702c7342011-10-05 13:50:52 +02002191static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002192_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002195 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196
Serhiy Storchaka678db842013-01-26 12:16:36 +02002197 if (size == 0)
2198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002199 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002200 if (size == 1)
2201 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002202
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002203 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002204 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 if (!res)
2206 return NULL;
2207 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002208 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002210}
2211
Victor Stinnere57b1c02011-09-28 22:20:48 +02002212static PyObject*
2213_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214{
2215 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002216 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002217
Serhiy Storchaka678db842013-01-26 12:16:36 +02002218 if (size == 0)
2219 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002220 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002221 if (size == 1)
2222 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002223
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002224 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002225 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 if (!res)
2227 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002230 else {
2231 _PyUnicode_CONVERT_BYTES(
2232 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002234 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 return res;
2236}
2237
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238static PyObject*
2239_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240{
2241 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002242 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243
Serhiy Storchaka678db842013-01-26 12:16:36 +02002244 if (size == 0)
2245 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002247 if (size == 1)
2248 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002250 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 if (!res)
2253 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002254 if (max_char < 256)
2255 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256 PyUnicode_1BYTE_DATA(res));
2257 else if (max_char < 0x10000)
2258 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259 PyUnicode_2BYTE_DATA(res));
2260 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002262 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 return res;
2264}
2265
2266PyObject*
2267PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002269 if (size < 0) {
2270 PyErr_SetString(PyExc_ValueError, "size must be positive");
2271 return NULL;
2272 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002273 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002275 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002277 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002281 PyErr_SetString(PyExc_SystemError, "invalid kind");
2282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284}
2285
Victor Stinnerece58de2012-04-23 23:36:38 +02002286Py_UCS4
2287_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288{
2289 enum PyUnicode_Kind kind;
2290 void *startptr, *endptr;
2291
2292 assert(PyUnicode_IS_READY(unicode));
2293 assert(0 <= start);
2294 assert(end <= PyUnicode_GET_LENGTH(unicode));
2295 assert(start <= end);
2296
2297 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2298 return PyUnicode_MAX_CHAR_VALUE(unicode);
2299
2300 if (start == end)
2301 return 127;
2302
Victor Stinner94d558b2012-04-27 22:26:58 +02002303 if (PyUnicode_IS_ASCII(unicode))
2304 return 127;
2305
Victor Stinnerece58de2012-04-23 23:36:38 +02002306 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002307 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002308 endptr = (char *)startptr + end * kind;
2309 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002310 switch(kind) {
2311 case PyUnicode_1BYTE_KIND:
2312 return ucs1lib_find_max_char(startptr, endptr);
2313 case PyUnicode_2BYTE_KIND:
2314 return ucs2lib_find_max_char(startptr, endptr);
2315 case PyUnicode_4BYTE_KIND:
2316 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002318 assert(0);
2319 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002320 }
2321}
2322
Victor Stinner25a4b292011-10-06 12:31:55 +02002323/* Ensure that a string uses the most efficient storage, if it is not the
2324 case: create a new string with of the right kind. Write NULL into *p_unicode
2325 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002326static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002327unicode_adjust_maxchar(PyObject **p_unicode)
2328{
2329 PyObject *unicode, *copy;
2330 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002331 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 unsigned int kind;
2333
2334 assert(p_unicode != NULL);
2335 unicode = *p_unicode;
2336 assert(PyUnicode_IS_READY(unicode));
2337 if (PyUnicode_IS_ASCII(unicode))
2338 return;
2339
2340 len = PyUnicode_GET_LENGTH(unicode);
2341 kind = PyUnicode_KIND(unicode);
2342 if (kind == PyUnicode_1BYTE_KIND) {
2343 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002344 max_char = ucs1lib_find_max_char(u, u + len);
2345 if (max_char >= 128)
2346 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002347 }
2348 else if (kind == PyUnicode_2BYTE_KIND) {
2349 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002350 max_char = ucs2lib_find_max_char(u, u + len);
2351 if (max_char >= 256)
2352 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002353 }
2354 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002356 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 max_char = ucs4lib_find_max_char(u, u + len);
2358 if (max_char >= 0x10000)
2359 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002360 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002361 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002362 if (copy != NULL)
2363 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 Py_DECREF(unicode);
2365 *p_unicode = copy;
2366}
2367
Victor Stinner034f6cf2011-09-30 02:26:44 +02002368PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002369_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370{
Victor Stinner87af4f22011-11-21 23:03:47 +01002371 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002372 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002373
Victor Stinner034f6cf2011-09-30 02:26:44 +02002374 if (!PyUnicode_Check(unicode)) {
2375 PyErr_BadInternalCall();
2376 return NULL;
2377 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002378 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002380
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 length = PyUnicode_GET_LENGTH(unicode);
2382 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 if (!copy)
2384 return NULL;
2385 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2386
Christian Heimesf051e432016-09-13 20:22:02 +02002387 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002388 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002389 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002391}
2392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393
Victor Stinnerbc603d12011-10-02 01:00:40 +02002394/* Widen Unicode objects to larger buffers. Don't write terminating null
2395 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396
2397void*
2398_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2399{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 Py_ssize_t len;
2401 void *result;
2402 unsigned int skind;
2403
Benjamin Petersonbac79492012-01-14 13:34:47 -05002404 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405 return NULL;
2406
2407 len = PyUnicode_GET_LENGTH(s);
2408 skind = PyUnicode_KIND(s);
2409 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002410 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 return NULL;
2412 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002413 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002415 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 if (!result)
2417 return PyErr_NoMemory();
2418 assert(skind == PyUnicode_1BYTE_KIND);
2419 _PyUnicode_CONVERT_BYTES(
2420 Py_UCS1, Py_UCS2,
2421 PyUnicode_1BYTE_DATA(s),
2422 PyUnicode_1BYTE_DATA(s) + len,
2423 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 if (skind == PyUnicode_2BYTE_KIND) {
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS2, Py_UCS4,
2432 PyUnicode_2BYTE_DATA(s),
2433 PyUnicode_2BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 else {
2437 assert(skind == PyUnicode_1BYTE_KIND);
2438 _PyUnicode_CONVERT_BYTES(
2439 Py_UCS1, Py_UCS4,
2440 PyUnicode_1BYTE_DATA(s),
2441 PyUnicode_1BYTE_DATA(s) + len,
2442 result);
2443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002445 default:
2446 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 }
Victor Stinner01698042011-10-04 00:04:26 +02002448 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 return NULL;
2450}
2451
2452static Py_UCS4*
2453as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2454 int copy_null)
2455{
2456 int kind;
2457 void *data;
2458 Py_ssize_t len, targetlen;
2459 if (PyUnicode_READY(string) == -1)
2460 return NULL;
2461 kind = PyUnicode_KIND(string);
2462 data = PyUnicode_DATA(string);
2463 len = PyUnicode_GET_LENGTH(string);
2464 targetlen = len;
2465 if (copy_null)
2466 targetlen++;
2467 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002468 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!target) {
2470 PyErr_NoMemory();
2471 return NULL;
2472 }
2473 }
2474 else {
2475 if (targetsize < targetlen) {
2476 PyErr_Format(PyExc_SystemError,
2477 "string is longer than the buffer");
2478 if (copy_null && 0 < targetsize)
2479 target[0] = 0;
2480 return NULL;
2481 }
2482 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002483 if (kind == PyUnicode_1BYTE_KIND) {
2484 Py_UCS1 *start = (Py_UCS1 *) data;
2485 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002487 else if (kind == PyUnicode_2BYTE_KIND) {
2488 Py_UCS2 *start = (Py_UCS2 *) data;
2489 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2490 }
2491 else {
2492 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002493 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 if (copy_null)
2496 target[len] = 0;
2497 return target;
2498}
2499
2500Py_UCS4*
2501PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002504 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 PyErr_BadInternalCall();
2506 return NULL;
2507 }
2508 return as_ucs4(string, target, targetsize, copy_null);
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4Copy(PyObject *string)
2513{
2514 return as_ucs4(string, NULL, 0, 1);
2515}
2516
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2520#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002521
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002522static int
2523unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2524 Py_ssize_t width, Py_ssize_t precision)
2525{
2526 Py_ssize_t length, fill, arglen;
2527 Py_UCS4 maxchar;
2528
2529 if (PyUnicode_READY(str) == -1)
2530 return -1;
2531
2532 length = PyUnicode_GET_LENGTH(str);
2533 if ((precision == -1 || precision >= length)
2534 && width <= length)
2535 return _PyUnicodeWriter_WriteStr(writer, str);
2536
2537 if (precision != -1)
2538 length = Py_MIN(precision, length);
2539
2540 arglen = Py_MAX(length, width);
2541 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2542 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2543 else
2544 maxchar = writer->maxchar;
2545
2546 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2547 return -1;
2548
2549 if (width > length) {
2550 fill = width - length;
2551 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2552 return -1;
2553 writer->pos += fill;
2554 }
2555
2556 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2557 str, 0, length);
2558 writer->pos += length;
2559 return 0;
2560}
2561
2562static int
2563unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2564 Py_ssize_t width, Py_ssize_t precision)
2565{
2566 /* UTF-8 */
2567 Py_ssize_t length;
2568 PyObject *unicode;
2569 int res;
2570
2571 length = strlen(str);
2572 if (precision != -1)
2573 length = Py_MIN(length, precision);
2574 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2575 if (unicode == NULL)
2576 return -1;
2577
2578 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2579 Py_DECREF(unicode);
2580 return res;
2581}
2582
Victor Stinner96865452011-03-01 23:44:09 +00002583static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002584unicode_fromformat_arg(_PyUnicodeWriter *writer,
2585 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002586{
Victor Stinnere215d962012-10-06 23:03:36 +02002587 const char *p;
2588 Py_ssize_t len;
2589 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 Py_ssize_t width;
2591 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002592 int longflag;
2593 int longlongflag;
2594 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002596
2597 p = f;
2598 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002599 zeropad = 0;
2600 if (*f == '0') {
2601 zeropad = 1;
2602 f++;
2603 }
Victor Stinner96865452011-03-01 23:44:09 +00002604
2605 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 width = -1;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002609 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002610 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002612 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002614 return NULL;
2615 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002617 f++;
2618 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 }
2620 precision = -1;
2621 if (*f == '.') {
2622 f++;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 precision = (*f - '0');
2625 f++;
2626 while (Py_ISDIGIT((unsigned)*f)) {
2627 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2628 PyErr_SetString(PyExc_ValueError,
2629 "precision too big");
2630 return NULL;
2631 }
2632 precision = (precision * 10) + (*f - '0');
2633 f++;
2634 }
2635 }
Victor Stinner96865452011-03-01 23:44:09 +00002636 if (*f == '%') {
2637 /* "%.3%s" => f points to "3" */
2638 f--;
2639 }
2640 }
2641 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002642 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002643 f--;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645
2646 /* Handle %ld, %lu, %lld and %llu. */
2647 longflag = 0;
2648 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002649 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002650 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002652 longflag = 1;
2653 ++f;
2654 }
Victor Stinner96865452011-03-01 23:44:09 +00002655 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002656 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002657 longlongflag = 1;
2658 f += 2;
2659 }
Victor Stinner96865452011-03-01 23:44:09 +00002660 }
2661 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002663 size_tflag = 1;
2664 ++f;
2665 }
Victor Stinnere215d962012-10-06 23:03:36 +02002666
2667 if (f[1] == '\0')
2668 writer->overallocate = 0;
2669
2670 switch (*f) {
2671 case 'c':
2672 {
2673 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002674 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002675 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002676 "character argument not in range(0x110000)");
2677 return NULL;
2678 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002679 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002680 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002681 break;
2682 }
2683
2684 case 'i':
2685 case 'd':
2686 case 'u':
2687 case 'x':
2688 {
2689 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002690 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002692
2693 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002694 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002695 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002697 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002698 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002699 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002700 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, size_t));
2703 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, unsigned int));
2706 }
2707 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002709 }
2710 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002714 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002715 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002716 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002717 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002719 va_arg(*vargs, Py_ssize_t));
2720 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002721 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_arg(*vargs, int));
2723 }
2724 assert(len >= 0);
2725
Victor Stinnere215d962012-10-06 23:03:36 +02002726 if (precision < len)
2727 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728
2729 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2731 return NULL;
2732
Victor Stinnere215d962012-10-06 23:03:36 +02002733 if (width > precision) {
2734 Py_UCS4 fillchar;
2735 fill = width - precision;
2736 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002737 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2738 return NULL;
2739 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 }
Victor Stinner15a11362012-10-06 23:48:20 +02002741 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002743 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2744 return NULL;
2745 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002746 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747
Victor Stinner4a587072013-11-19 12:54:53 +01002748 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2749 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002750 break;
2751 }
2752
2753 case 'p':
2754 {
2755 char number[MAX_LONG_LONG_CHARS];
2756
2757 len = sprintf(number, "%p", va_arg(*vargs, void*));
2758 assert(len >= 0);
2759
2760 /* %p is ill-defined: ensure leading 0x. */
2761 if (number[1] == 'X')
2762 number[1] = 'x';
2763 else if (number[1] != 'x') {
2764 memmove(number + 2, number,
2765 strlen(number) + 1);
2766 number[0] = '0';
2767 number[1] = 'x';
2768 len += 2;
2769 }
2770
Victor Stinner4a587072013-11-19 12:54:53 +01002771 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002772 return NULL;
2773 break;
2774 }
2775
2776 case 's':
2777 {
2778 /* UTF-8 */
2779 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002782 break;
2783 }
2784
2785 case 'U':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 assert(obj && _PyUnicode_CHECK(obj));
2789
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 break;
2793 }
2794
2795 case 'V':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002799 if (obj) {
2800 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 }
2804 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 assert(str != NULL);
2806 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002808 }
2809 break;
2810 }
2811
2812 case 'S':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *str;
2816 assert(obj);
2817 str = PyObject_Str(obj);
2818 if (!str)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(str);
2822 return NULL;
2823 }
2824 Py_DECREF(str);
2825 break;
2826 }
2827
2828 case 'R':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *repr;
2832 assert(obj);
2833 repr = PyObject_Repr(obj);
2834 if (!repr)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(repr);
2838 return NULL;
2839 }
2840 Py_DECREF(repr);
2841 break;
2842 }
2843
2844 case 'A':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *ascii;
2848 assert(obj);
2849 ascii = PyObject_ASCII(obj);
2850 if (!ascii)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(ascii);
2854 return NULL;
2855 }
2856 Py_DECREF(ascii);
2857 break;
2858 }
2859
2860 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002861 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864
2865 default:
2866 /* if we stumble upon an unknown formatting code, copy the rest
2867 of the format string to the output string. (we cannot just
2868 skip the code, since there's no way to know what's in the
2869 argument list) */
2870 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002871 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002872 return NULL;
2873 f = p+len;
2874 return f;
2875 }
2876
2877 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002878 return f;
2879}
2880
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881PyObject *
2882PyUnicode_FromFormatV(const char *format, va_list vargs)
2883{
Victor Stinnere215d962012-10-06 23:03:36 +02002884 va_list vargs2;
2885 const char *f;
2886 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002887
Victor Stinner8f674cc2013-04-17 23:02:17 +02002888 _PyUnicodeWriter_Init(&writer);
2889 writer.min_length = strlen(format) + 100;
2890 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002891
Benjamin Peterson0c212142016-09-20 20:39:33 -07002892 // Copy varags to be able to pass a reference to a subfunction.
2893 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002894
2895 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002897 f = unicode_fromformat_arg(&writer, f, &vargs2);
2898 if (f == NULL)
2899 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002902 const char *p;
2903 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904
Victor Stinnere215d962012-10-06 23:03:36 +02002905 p = f;
2906 do
2907 {
2908 if ((unsigned char)*p > 127) {
2909 PyErr_Format(PyExc_ValueError,
2910 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2911 "string, got a non-ASCII byte: 0x%02x",
2912 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 p++;
2916 }
2917 while (*p != '\0' && *p != '%');
2918 len = p - f;
2919
2920 if (*p == '\0')
2921 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002922
2923 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925
2926 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002929 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002930 return _PyUnicodeWriter_Finish(&writer);
2931
2932 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002933 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002934 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936}
2937
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938PyObject *
2939PyUnicode_FromFormat(const char *format, ...)
2940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002941 PyObject* ret;
2942 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002943
2944#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 ret = PyUnicode_FromFormatV(format, vargs);
2950 va_end(vargs);
2951 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954#ifdef HAVE_WCHAR_H
2955
Victor Stinner5593d8a2010-10-02 11:11:27 +00002956/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2957 convert a Unicode object to a wide character string.
2958
Victor Stinnerd88d9832011-09-06 02:00:05 +02002959 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 character) required to convert the unicode object. Ignore size argument.
2961
Victor Stinnerd88d9832011-09-06 02:00:05 +02002962 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002963 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002964 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002965static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002966unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002967 wchar_t *w,
2968 Py_ssize_t size)
2969{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002970 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 const wchar_t *wstr;
2972
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002973 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002974 if (wstr == NULL)
2975 return -1;
2976
Victor Stinner5593d8a2010-10-02 11:11:27 +00002977 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002978 if (size > res)
2979 size = res + 1;
2980 else
2981 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002982 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002983 return res;
2984 }
2985 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002986 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002987}
2988
2989Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002990PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002991 wchar_t *w,
2992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993{
2994 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 PyErr_BadInternalCall();
2996 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002998 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999}
3000
Victor Stinner137c34c2010-09-29 10:25:54 +00003001wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003002PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003003 Py_ssize_t *size)
3004{
3005 wchar_t* buffer;
3006 Py_ssize_t buflen;
3007
3008 if (unicode == NULL) {
3009 PyErr_BadInternalCall();
3010 return NULL;
3011 }
3012
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003013 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 if (buflen == -1)
3015 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003016 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 if (buffer == NULL) {
3018 PyErr_NoMemory();
3019 return NULL;
3020 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003021 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003022 if (buflen == -1) {
3023 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003024 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003025 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003026 if (size != NULL)
3027 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003028 return buffer;
3029}
3030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003031#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Alexander Belopolsky40018472011-02-26 01:02:56 +00003033PyObject *
3034PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003035{
Victor Stinner8faf8212011-12-08 22:14:11 +01003036 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 PyErr_SetString(PyExc_ValueError,
3038 "chr() arg not in range(0x110000)");
3039 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003040 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003041
Victor Stinner985a82a2014-01-03 12:53:47 +01003042 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003046PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003048 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003050 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003051 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003052 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 Py_INCREF(obj);
3054 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003055 }
3056 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 /* For a Unicode subtype that's not a Unicode object,
3058 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003059 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003060 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003061 PyErr_Format(PyExc_TypeError,
3062 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003063 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003064 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003065}
3066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003068PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 const char *encoding,
3070 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003071{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003072 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003073 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003074
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 PyErr_BadInternalCall();
3077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003079
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003080 /* Decoding bytes objects is the most common case and should be fast */
3081 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003082 if (PyBytes_GET_SIZE(obj) == 0)
3083 _Py_RETURN_UNICODE_EMPTY();
3084 v = PyUnicode_Decode(
3085 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3086 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003087 return v;
3088 }
3089
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003090 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 PyErr_SetString(PyExc_TypeError,
3092 "decoding str is not supported");
3093 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003094 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003095
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003096 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3097 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3098 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003099 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003100 Py_TYPE(obj)->tp_name);
3101 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003102 }
Tim Petersced69f82003-09-16 20:30:58 +00003103
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003104 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003105 PyBuffer_Release(&buffer);
3106 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003108
Serhiy Storchaka05997252013-01-26 12:14:02 +02003109 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003110 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003111 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112}
3113
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is replaced by a single '_', except at the very start
   and at the end of the name, where such runs are dropped.

   encoding   NUL-terminated input name, must not be NULL.
   lower      output buffer, receives the NUL-terminated result.
   lower_len  size of the output buffer in bytes, terminator included.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters, which includes the case lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the terminator; bail out before
       computing lower_len - 1, which would wrap around for a size_t. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    /* Set while the most recently seen characters were separators. */
    punct = 0;

    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* Emit one '_' for the pending separator run, unless nothing has
           been written yet (leading separators are dropped). */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }

    *l = '\0';
    return 1;
}
3162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003165 Py_ssize_t size,
3166 const char *encoding,
3167 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003168{
3169 PyObject *buffer = NULL, *unicode;
3170 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003171 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3172
3173 if (encoding == NULL) {
3174 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3175 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003176
Fred Drakee4315f52000-05-09 19:53:39 +00003177 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003178 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3179 char *lower = buflower;
3180
3181 /* Fast paths */
3182 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3183 lower += 3;
3184 if (*lower == '_') {
3185 /* Match "utf8" and "utf_8" */
3186 lower++;
3187 }
3188
3189 if (lower[0] == '8' && lower[1] == 0) {
3190 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3191 }
3192 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3193 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3194 }
3195 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3196 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3197 }
3198 }
3199 else {
3200 if (strcmp(lower, "ascii") == 0
3201 || strcmp(lower, "us_ascii") == 0) {
3202 return PyUnicode_DecodeASCII(s, size, errors);
3203 }
Steve Dowercc16be82016-09-08 10:35:16 -07003204 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003205 else if (strcmp(lower, "mbcs") == 0) {
3206 return PyUnicode_DecodeMBCS(s, size, errors);
3207 }
3208 #endif
3209 else if (strcmp(lower, "latin1") == 0
3210 || strcmp(lower, "latin_1") == 0
3211 || strcmp(lower, "iso_8859_1") == 0
3212 || strcmp(lower, "iso8859_1") == 0) {
3213 return PyUnicode_DecodeLatin1(s, size, errors);
3214 }
3215 }
Victor Stinner37296e82010-06-10 13:36:23 +00003216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217
3218 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003219 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003220 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003221 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003222 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 if (buffer == NULL)
3224 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003225 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 if (unicode == NULL)
3227 goto onError;
3228 if (!PyUnicode_Check(unicode)) {
3229 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003230 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3231 "use codecs.decode() to decode to arbitrary types",
3232 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003233 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 Py_DECREF(unicode);
3235 goto onError;
3236 }
3237 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003238 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003239
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 Py_XDECREF(buffer);
3242 return NULL;
3243}
3244
Alexander Belopolsky40018472011-02-26 01:02:56 +00003245PyObject *
3246PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003247 const char *encoding,
3248 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250 if (!PyUnicode_Check(unicode)) {
3251 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003252 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003253 }
3254
Serhiy Storchaka00939072016-10-27 21:05:49 +03003255 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3256 "PyUnicode_AsDecodedObject() is deprecated; "
3257 "use PyCodec_Decode() to decode from str", 1) < 0)
3258 return NULL;
3259
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003260 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003262
3263 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003264 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003265}
3266
Alexander Belopolsky40018472011-02-26 01:02:56 +00003267PyObject *
3268PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003269 const char *encoding,
3270 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003271{
3272 PyObject *v;
3273
3274 if (!PyUnicode_Check(unicode)) {
3275 PyErr_BadArgument();
3276 goto onError;
3277 }
3278
Serhiy Storchaka00939072016-10-27 21:05:49 +03003279 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3280 "PyUnicode_AsDecodedUnicode() is deprecated; "
3281 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3282 return NULL;
3283
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003286
3287 /* Decode via the codec registry */
3288 v = PyCodec_Decode(unicode, encoding, errors);
3289 if (v == NULL)
3290 goto onError;
3291 if (!PyUnicode_Check(v)) {
3292 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003293 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3294 "use codecs.decode() to decode to arbitrary types",
3295 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003296 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003297 Py_DECREF(v);
3298 goto onError;
3299 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003300 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003301
Benjamin Peterson29060642009-01-31 22:14:21 +00003302 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003303 return NULL;
3304}
3305
Alexander Belopolsky40018472011-02-26 01:02:56 +00003306PyObject *
3307PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003308 Py_ssize_t size,
3309 const char *encoding,
3310 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311{
3312 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003313
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003314 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3318 Py_DECREF(unicode);
3319 return v;
3320}
3321
Alexander Belopolsky40018472011-02-26 01:02:56 +00003322PyObject *
3323PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003324 const char *encoding,
3325 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003326{
3327 PyObject *v;
3328
3329 if (!PyUnicode_Check(unicode)) {
3330 PyErr_BadArgument();
3331 goto onError;
3332 }
3333
Serhiy Storchaka00939072016-10-27 21:05:49 +03003334 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3335 "PyUnicode_AsEncodedObject() is deprecated; "
3336 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3337 "or PyCodec_Encode() for generic encoding", 1) < 0)
3338 return NULL;
3339
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003340 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003342
3343 /* Encode via the codec registry */
3344 v = PyCodec_Encode(unicode, encoding, errors);
3345 if (v == NULL)
3346 goto onError;
3347 return v;
3348
Benjamin Peterson29060642009-01-31 22:14:21 +00003349 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003350 return NULL;
3351}
3352
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   Each character (or, when wchar_t is 16 bits wide, each surrogate pair)
   is converted on its own; the index of the first one for which wcstombs()
   fails is returned.  Returns 0 if every character converts, so 0 is
   ambiguous between "first character" and "nothing found". */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together so it is converted as one character. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif

        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return char_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3399
Victor Stinner1b579672011-12-17 05:47:23 +01003400static int
3401locale_error_handler(const char *errors, int *surrogateescape)
3402{
Victor Stinner50149202015-09-22 00:26:54 +02003403 _Py_error_handler error_handler = get_error_handler(errors);
3404 switch (error_handler)
3405 {
3406 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003407 *surrogateescape = 0;
3408 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003409 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003410 *surrogateescape = 1;
3411 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003412 default:
3413 PyErr_Format(PyExc_ValueError,
3414 "only 'strict' and 'surrogateescape' error handlers "
3415 "are supported, not '%s'",
3416 errors);
3417 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003418 }
Victor Stinner1b579672011-12-17 05:47:23 +01003419}
3420
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003422PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003423{
3424 Py_ssize_t wlen, wlen2;
3425 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003426 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003427 PyObject *bytes, *reason, *exc;
3428 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003429 int surrogateescape;
3430
3431 if (locale_error_handler(errors, &surrogateescape) < 0)
3432 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003433
3434 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3435 if (wstr == NULL)
3436 return NULL;
3437
3438 wlen2 = wcslen(wstr);
3439 if (wlen2 != wlen) {
3440 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003441 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003442 return NULL;
3443 }
3444
3445 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003446 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003447 char *str;
3448
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003449 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450 if (str == NULL) {
3451 if (error_pos == (size_t)-1) {
3452 PyErr_NoMemory();
3453 PyMem_Free(wstr);
3454 return NULL;
3455 }
3456 else {
3457 goto encode_error;
3458 }
3459 }
3460 PyMem_Free(wstr);
3461
3462 bytes = PyBytes_FromString(str);
3463 PyMem_Free(str);
3464 }
3465 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003466 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003467 size_t len, len2;
3468
3469 len = wcstombs(NULL, wstr, 0);
3470 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003471 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003472 goto encode_error;
3473 }
3474
3475 bytes = PyBytes_FromStringAndSize(NULL, len);
3476 if (bytes == NULL) {
3477 PyMem_Free(wstr);
3478 return NULL;
3479 }
3480
3481 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3482 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003483 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003484 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003485 goto encode_error;
3486 }
3487 PyMem_Free(wstr);
3488 }
3489 return bytes;
3490
3491encode_error:
3492 errmsg = strerror(errno);
3493 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003494
3495 if (error_pos == (size_t)-1)
3496 error_pos = wcstombs_errorpos(wstr);
3497
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003498 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003499
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003500 wstr = Py_DecodeLocale(errmsg, &errlen);
3501 if (wstr != NULL) {
3502 reason = PyUnicode_FromWideChar(wstr, errlen);
3503 PyMem_RawFree(wstr);
3504 } else {
3505 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003506 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003507
Victor Stinner2f197072011-12-17 07:08:30 +01003508 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003509 reason = PyUnicode_FromString(
3510 "wcstombs() encountered an unencodable "
3511 "wide character");
3512 if (reason == NULL)
3513 return NULL;
3514
3515 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3516 "locale", unicode,
3517 (Py_ssize_t)error_pos,
3518 (Py_ssize_t)(error_pos+1),
3519 reason);
3520 Py_DECREF(reason);
3521 if (exc != NULL) {
3522 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003523 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003524 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003525 return NULL;
3526}
3527
Victor Stinnerad158722010-10-27 00:25:46 +00003528PyObject *
3529PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003530{
Steve Dowercc16be82016-09-08 10:35:16 -07003531#if defined(__APPLE__)
3532 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003533#else
Victor Stinner793b5312011-04-27 00:24:21 +02003534 PyInterpreterState *interp = PyThreadState_GET()->interp;
3535 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3536 cannot use it to encode and decode filenames before it is loaded. Load
3537 the Python codec requires to encode at least its own filename. Use the C
3538 version of the locale codec until the codec registry is initialized and
3539 the Python codec is loaded.
3540
3541 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3542 cannot only rely on it: check also interp->fscodec_initialized for
3543 subinterpreters. */
3544 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003545 return PyUnicode_AsEncodedString(unicode,
3546 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003547 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003548 }
3549 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003550 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003551 }
Victor Stinnerad158722010-10-27 00:25:46 +00003552#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003553}
3554
Alexander Belopolsky40018472011-02-26 01:02:56 +00003555PyObject *
3556PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003557 const char *encoding,
3558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559{
3560 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003561 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003562
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 if (!PyUnicode_Check(unicode)) {
3564 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 }
Fred Drakee4315f52000-05-09 19:53:39 +00003567
Victor Stinner942889a2016-09-05 15:40:10 -07003568 if (encoding == NULL) {
3569 return _PyUnicode_AsUTF8String(unicode, errors);
3570 }
3571
Fred Drakee4315f52000-05-09 19:53:39 +00003572 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003573 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3574 char *lower = buflower;
3575
3576 /* Fast paths */
3577 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3578 lower += 3;
3579 if (*lower == '_') {
3580 /* Match "utf8" and "utf_8" */
3581 lower++;
3582 }
3583
3584 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003585 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003586 }
3587 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3588 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3589 }
3590 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3591 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3592 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003593 }
Victor Stinner942889a2016-09-05 15:40:10 -07003594 else {
3595 if (strcmp(lower, "ascii") == 0
3596 || strcmp(lower, "us_ascii") == 0) {
3597 return _PyUnicode_AsASCIIString(unicode, errors);
3598 }
Steve Dowercc16be82016-09-08 10:35:16 -07003599#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003600 else if (strcmp(lower, "mbcs") == 0) {
3601 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3602 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003603#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003604 else if (strcmp(lower, "latin1") == 0 ||
3605 strcmp(lower, "latin_1") == 0 ||
3606 strcmp(lower, "iso_8859_1") == 0 ||
3607 strcmp(lower, "iso8859_1") == 0) {
3608 return _PyUnicode_AsLatin1String(unicode, errors);
3609 }
3610 }
Victor Stinner37296e82010-06-10 13:36:23 +00003611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612
3613 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003614 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003616 return NULL;
3617
3618 /* The normal path */
3619 if (PyBytes_Check(v))
3620 return v;
3621
3622 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003623 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003624 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003625 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003626
3627 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003628 "encoder %s returned bytearray instead of bytes; "
3629 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003630 encoding);
3631 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003632 Py_DECREF(v);
3633 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003634 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003635
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003636 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3637 Py_DECREF(v);
3638 return b;
3639 }
3640
3641 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003642 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3643 "use codecs.encode() to encode to arbitrary types",
3644 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003645 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003646 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003647 return NULL;
3648}
3649
Alexander Belopolsky40018472011-02-26 01:02:56 +00003650PyObject *
3651PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003652 const char *encoding,
3653 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003654{
3655 PyObject *v;
3656
3657 if (!PyUnicode_Check(unicode)) {
3658 PyErr_BadArgument();
3659 goto onError;
3660 }
3661
Serhiy Storchaka00939072016-10-27 21:05:49 +03003662 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3663 "PyUnicode_AsEncodedUnicode() is deprecated; "
3664 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3665 return NULL;
3666
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003667 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003669
3670 /* Encode via the codec registry */
3671 v = PyCodec_Encode(unicode, encoding, errors);
3672 if (v == NULL)
3673 goto onError;
3674 if (!PyUnicode_Check(v)) {
3675 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003676 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3677 "use codecs.encode() to encode to arbitrary types",
3678 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003679 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003680 Py_DECREF(v);
3681 goto onError;
3682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003684
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 return NULL;
3687}
3688
/* Locate the first byte sequence of `str` (at most `len` bytes) that
   mbrtowc() cannot convert.

   Returns the byte offset of that sequence, or 0 when no such sequence is
   found (including when mbrtowc() is not available, i.e. HAVE_MBRTOWC is
   not defined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3719
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003720PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003721PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003722 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003723{
3724 wchar_t smallbuf[256];
3725 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3726 wchar_t *wstr;
3727 size_t wlen, wlen2;
3728 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003729 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003730 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003731 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003732 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003733
3734 if (locale_error_handler(errors, &surrogateescape) < 0)
3735 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003736
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003737 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3738 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003739 return NULL;
3740 }
3741
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003742 if (surrogateescape) {
3743 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003744 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003745 if (wstr == NULL) {
3746 if (wlen == (size_t)-1)
3747 PyErr_NoMemory();
3748 else
3749 PyErr_SetFromErrno(PyExc_OSError);
3750 return NULL;
3751 }
3752
3753 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003754 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755 }
3756 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003757 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003758#ifndef HAVE_BROKEN_MBSTOWCS
3759 wlen = mbstowcs(NULL, str, 0);
3760#else
3761 wlen = len;
3762#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003763 if (wlen == (size_t)-1)
3764 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003765 if (wlen+1 <= smallbuf_len) {
3766 wstr = smallbuf;
3767 }
3768 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003769 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003770 if (!wstr)
3771 return PyErr_NoMemory();
3772 }
3773
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003774 wlen2 = mbstowcs(wstr, str, wlen+1);
3775 if (wlen2 == (size_t)-1) {
3776 if (wstr != smallbuf)
3777 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003778 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003779 }
3780#ifdef HAVE_BROKEN_MBSTOWCS
3781 assert(wlen2 == wlen);
3782#endif
3783 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3784 if (wstr != smallbuf)
3785 PyMem_Free(wstr);
3786 }
3787 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003788
3789decode_error:
3790 errmsg = strerror(errno);
3791 assert(errmsg != NULL);
3792
3793 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003794 wstr = Py_DecodeLocale(errmsg, &errlen);
3795 if (wstr != NULL) {
3796 reason = PyUnicode_FromWideChar(wstr, errlen);
3797 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003798 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003799
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003800 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003801 reason = PyUnicode_FromString(
3802 "mbstowcs() encountered an invalid multibyte sequence");
3803 if (reason == NULL)
3804 return NULL;
3805
3806 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3807 "locale", str, len,
3808 (Py_ssize_t)error_pos,
3809 (Py_ssize_t)(error_pos+1),
3810 reason);
3811 Py_DECREF(reason);
3812 if (exc != NULL) {
3813 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003814 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003815 }
3816 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003817}
3818
3819PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003820PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003821{
3822 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003823 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003824}
3825
3826
3827PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003828PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003829 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003830 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3831}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003832
Christian Heimes5894ba72007-11-04 11:43:14 +00003833PyObject*
3834PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3835{
Steve Dowercc16be82016-09-08 10:35:16 -07003836#if defined(__APPLE__)
3837 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003838#else
Victor Stinner793b5312011-04-27 00:24:21 +02003839 PyInterpreterState *interp = PyThreadState_GET()->interp;
3840 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3841 cannot use it to encode and decode filenames before it is loaded. Load
3842 the Python codec requires to encode at least its own filename. Use the C
3843 version of the locale codec until the codec registry is initialized and
3844 the Python codec is loaded.
3845
3846 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3847 cannot only rely on it: check also interp->fscodec_initialized for
3848 subinterpreters. */
3849 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003850 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003851 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003852 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003853 }
3854 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003855 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003856 }
Victor Stinnerad158722010-10-27 00:25:46 +00003857#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003858}
3859
Martin v. Löwis011e8422009-05-05 04:43:17 +00003860
3861int
3862PyUnicode_FSConverter(PyObject* arg, void* addr)
3863{
Brett Cannonec6ce872016-09-06 15:50:29 -07003864 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003865 PyObject *output = NULL;
3866 Py_ssize_t size;
3867 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003868 if (arg == NULL) {
3869 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003870 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003871 return 1;
3872 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003873 path = PyOS_FSPath(arg);
3874 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003875 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003876 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003877 if (PyBytes_Check(path)) {
3878 output = path;
3879 }
3880 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3881 output = PyUnicode_EncodeFSDefault(path);
3882 Py_DECREF(path);
3883 if (!output) {
3884 return 0;
3885 }
3886 assert(PyBytes_Check(output));
3887 }
3888
Victor Stinner0ea2a462010-04-30 00:22:08 +00003889 size = PyBytes_GET_SIZE(output);
3890 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003891 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003892 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003893 Py_DECREF(output);
3894 return 0;
3895 }
3896 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003897 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003898}
3899
3900
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003901int
3902PyUnicode_FSDecoder(PyObject* arg, void* addr)
3903{
Brett Cannona5711202016-09-06 19:36:01 -07003904 int is_buffer = 0;
3905 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003906 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003907 if (arg == NULL) {
3908 Py_DECREF(*(PyObject**)addr);
3909 return 1;
3910 }
Brett Cannona5711202016-09-06 19:36:01 -07003911
3912 is_buffer = PyObject_CheckBuffer(arg);
3913 if (!is_buffer) {
3914 path = PyOS_FSPath(arg);
3915 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003916 return 0;
3917 }
Brett Cannona5711202016-09-06 19:36:01 -07003918 }
3919 else {
3920 path = arg;
3921 Py_INCREF(arg);
3922 }
3923
3924 if (PyUnicode_Check(path)) {
3925 if (PyUnicode_READY(path) == -1) {
3926 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003927 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003928 }
3929 output = path;
3930 }
3931 else if (PyBytes_Check(path) || is_buffer) {
3932 PyObject *path_bytes = NULL;
3933
3934 if (!PyBytes_Check(path) &&
3935 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3936 "path should be string, bytes, or os.PathLike, not %.200s",
3937 Py_TYPE(arg)->tp_name)) {
3938 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003939 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003940 }
3941 path_bytes = PyBytes_FromObject(path);
3942 Py_DECREF(path);
3943 if (!path_bytes) {
3944 return 0;
3945 }
3946 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3947 PyBytes_GET_SIZE(path_bytes));
3948 Py_DECREF(path_bytes);
3949 if (!output) {
3950 return 0;
3951 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003952 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003953 else {
3954 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003955 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003956 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003957 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003958 return 0;
3959 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003960 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003961 Py_DECREF(output);
3962 return 0;
3963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003965 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003966 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003967 Py_DECREF(output);
3968 return 0;
3969 }
3970 *(PyObject**)addr = output;
3971 return Py_CLEANUP_SUPPORTED;
3972}
3973
3974
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003975const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003977{
Christian Heimesf3863112007-11-22 07:46:41 +00003978 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003980 if (!PyUnicode_Check(unicode)) {
3981 PyErr_BadArgument();
3982 return NULL;
3983 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003984 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003985 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003987 if (PyUnicode_UTF8(unicode) == NULL) {
3988 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003989 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 if (bytes == NULL)
3991 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003992 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3993 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003994 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 Py_DECREF(bytes);
3996 return NULL;
3997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003998 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003999 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004000 PyBytes_AS_STRING(bytes),
4001 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002 Py_DECREF(bytes);
4003 }
4004
4005 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004006 *psize = PyUnicode_UTF8_LENGTH(unicode);
4007 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004008}
4009
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004010const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4014}
4015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016Py_UNICODE *
4017PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 const unsigned char *one_byte;
4020#if SIZEOF_WCHAR_T == 4
4021 const Py_UCS2 *two_bytes;
4022#else
4023 const Py_UCS4 *four_bytes;
4024 const Py_UCS4 *ucs4_end;
4025 Py_ssize_t num_surrogates;
4026#endif
4027 wchar_t *w;
4028 wchar_t *wchar_end;
4029
4030 if (!PyUnicode_Check(unicode)) {
4031 PyErr_BadArgument();
4032 return NULL;
4033 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004034 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004036 assert(_PyUnicode_KIND(unicode) != 0);
4037 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004039 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004041 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4042 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 num_surrogates = 0;
4044
4045 for (; four_bytes < ucs4_end; ++four_bytes) {
4046 if (*four_bytes > 0xFFFF)
4047 ++num_surrogates;
4048 }
4049
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004050 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4051 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4052 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 PyErr_NoMemory();
4054 return NULL;
4055 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004056 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004058 w = _PyUnicode_WSTR(unicode);
4059 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4060 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4062 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004063 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004065 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4066 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 }
4068 else
4069 *w = *four_bytes;
4070
4071 if (w > wchar_end) {
4072 assert(0 && "Miscalculated string end");
4073 }
4074 }
4075 *w = 0;
4076#else
4077 /* sizeof(wchar_t) == 4 */
4078 Py_FatalError("Impossible unicode object state, wstr and str "
4079 "should share memory already.");
4080 return NULL;
4081#endif
4082 }
4083 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004084 if ((size_t)_PyUnicode_LENGTH(unicode) >
4085 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4086 PyErr_NoMemory();
4087 return NULL;
4088 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004089 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4090 (_PyUnicode_LENGTH(unicode) + 1));
4091 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 PyErr_NoMemory();
4093 return NULL;
4094 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004095 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4096 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4097 w = _PyUnicode_WSTR(unicode);
4098 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004100 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4101 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 for (; w < wchar_end; ++one_byte, ++w)
4103 *w = *one_byte;
4104 /* null-terminate the wstr */
4105 *w = 0;
4106 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004107 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004109 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110 for (; w < wchar_end; ++two_bytes, ++w)
4111 *w = *two_bytes;
4112 /* null-terminate the wstr */
4113 *w = 0;
4114#else
4115 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004116 PyObject_FREE(_PyUnicode_WSTR(unicode));
4117 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 Py_FatalError("Impossible unicode object state, wstr "
4119 "and str should share memory already.");
4120 return NULL;
4121#endif
4122 }
4123 else {
4124 assert(0 && "This should never happen.");
4125 }
4126 }
4127 }
4128 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004129 *size = PyUnicode_WSTR_LENGTH(unicode);
4130 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004131}
4132
Alexander Belopolsky40018472011-02-26 01:02:56 +00004133Py_UNICODE *
4134PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137}
4138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139
Alexander Belopolsky40018472011-02-26 01:02:56 +00004140Py_ssize_t
4141PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142{
4143 if (!PyUnicode_Check(unicode)) {
4144 PyErr_BadArgument();
4145 goto onError;
4146 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004147 if (_PyUnicode_WSTR(unicode) == NULL) {
4148 if (PyUnicode_AsUnicode(unicode) == NULL)
4149 goto onError;
4150 }
4151 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 return -1;
4155}
4156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004157Py_ssize_t
4158PyUnicode_GetLength(PyObject *unicode)
4159{
Victor Stinner07621332012-06-16 04:53:46 +02004160 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004161 PyErr_BadArgument();
4162 return -1;
4163 }
Victor Stinner07621332012-06-16 04:53:46 +02004164 if (PyUnicode_READY(unicode) == -1)
4165 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166 return PyUnicode_GET_LENGTH(unicode);
4167}
4168
4169Py_UCS4
4170PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4171{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004172 void *data;
4173 int kind;
4174
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004175 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4176 PyErr_BadArgument();
4177 return (Py_UCS4)-1;
4178 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004179 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004180 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 return (Py_UCS4)-1;
4182 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004183 data = PyUnicode_DATA(unicode);
4184 kind = PyUnicode_KIND(unicode);
4185 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004186}
4187
4188int
4189PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4190{
4191 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004192 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193 return -1;
4194 }
Victor Stinner488fa492011-12-12 00:01:39 +01004195 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004196 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004197 PyErr_SetString(PyExc_IndexError, "string index out of range");
4198 return -1;
4199 }
Victor Stinner488fa492011-12-12 00:01:39 +01004200 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004201 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004202 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4203 PyErr_SetString(PyExc_ValueError, "character out of range");
4204 return -1;
4205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4207 index, ch);
4208 return 0;
4209}
4210
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4216
Victor Stinner554f3f02010-06-16 23:33:54 +00004217/* create or adjust a UnicodeDecodeError */
4218static void
4219make_decode_exception(PyObject **exceptionObject,
4220 const char *encoding,
4221 const char *input, Py_ssize_t length,
4222 Py_ssize_t startpos, Py_ssize_t endpos,
4223 const char *reason)
4224{
4225 if (*exceptionObject == NULL) {
4226 *exceptionObject = PyUnicodeDecodeError_Create(
4227 encoding, input, length, startpos, endpos, reason);
4228 }
4229 else {
4230 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4231 goto onError;
4232 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4233 goto onError;
4234 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4235 goto onError;
4236 }
4237 return;
4238
4239onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004240 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004241}
4242
Steve Dowercc16be82016-09-08 10:35:16 -07004243#ifdef MS_WINDOWS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244/* error handling callback helper:
4245 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004246 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 and adjust various state variables.
4248 return 0 on success, -1 on error
4249*/
4250
Alexander Belopolsky40018472011-02-26 01:02:56 +00004251static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004252unicode_decode_call_errorhandler_wchar(
4253 const char *errors, PyObject **errorHandler,
4254 const char *encoding, const char *reason,
4255 const char **input, const char **inend, Py_ssize_t *startinpos,
4256 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4257 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004259 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260
4261 PyObject *restuple = NULL;
4262 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004263 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004264 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004265 Py_ssize_t requiredsize;
4266 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004267 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004268 wchar_t *repwstr;
4269 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004271 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4272 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004273
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 *errorHandler = PyCodec_LookupError(errors);
4276 if (*errorHandler == NULL)
4277 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 }
4279
Victor Stinner554f3f02010-06-16 23:33:54 +00004280 make_decode_exception(exceptionObject,
4281 encoding,
4282 *input, *inend - *input,
4283 *startinpos, *endinpos,
4284 reason);
4285 if (*exceptionObject == NULL)
4286 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004288 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004292 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004295 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297
4298 /* Copy back the bytes variables, which might have been modified by the
4299 callback */
4300 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4301 if (!inputobj)
4302 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 *input = PyBytes_AS_STRING(inputobj);
4304 insize = PyBytes_GET_SIZE(inputobj);
4305 *inend = *input + insize;
4306 /* we can DECREF safely, as the exception has another reference,
4307 so the object won't go away. */
4308 Py_DECREF(inputobj);
4309
4310 if (newpos<0)
4311 newpos = insize+newpos;
4312 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004313 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004314 goto onError;
4315 }
4316
4317 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4318 if (repwstr == NULL)
4319 goto onError;
4320 /* need more space? (at least enough for what we
4321 have+the replacement+the rest of the string (starting
4322 at the new input position), so we won't have to check space
4323 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004324 requiredsize = *outpos;
4325 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4326 goto overflow;
4327 requiredsize += repwlen;
4328 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4329 goto overflow;
4330 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004332 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004333 requiredsize = 2*outsize;
4334 if (unicode_resize(output, requiredsize) < 0)
4335 goto onError;
4336 }
4337 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4338 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004339 *endinpos = newpos;
4340 *inptr = *input + newpos;
4341
4342 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004343 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 return 0;
4345
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004346 overflow:
4347 PyErr_SetString(PyExc_OverflowError,
4348 "decoded result is too long for a Python string");
4349
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 onError:
4351 Py_XDECREF(restuple);
4352 return -1;
4353}
Steve Dowercc16be82016-09-08 10:35:16 -07004354#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004355
4356static int
4357unicode_decode_call_errorhandler_writer(
4358 const char *errors, PyObject **errorHandler,
4359 const char *encoding, const char *reason,
4360 const char **input, const char **inend, Py_ssize_t *startinpos,
4361 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4362 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4363{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004364 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004365
4366 PyObject *restuple = NULL;
4367 PyObject *repunicode = NULL;
4368 Py_ssize_t insize;
4369 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004370 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 PyObject *inputobj = NULL;
4372
4373 if (*errorHandler == NULL) {
4374 *errorHandler = PyCodec_LookupError(errors);
4375 if (*errorHandler == NULL)
4376 goto onError;
4377 }
4378
4379 make_decode_exception(exceptionObject,
4380 encoding,
4381 *input, *inend - *input,
4382 *startinpos, *endinpos,
4383 reason);
4384 if (*exceptionObject == NULL)
4385 goto onError;
4386
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004387 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004388 if (restuple == NULL)
4389 goto onError;
4390 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004391 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004392 goto onError;
4393 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004394 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004395 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004396
4397 /* Copy back the bytes variables, which might have been modified by the
4398 callback */
4399 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4400 if (!inputobj)
4401 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004402 *input = PyBytes_AS_STRING(inputobj);
4403 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004404 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004405 /* we can DECREF safely, as the exception has another reference,
4406 so the object won't go away. */
4407 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004411 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004412 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415
Victor Stinner170ca6f2013-04-18 00:25:28 +02004416 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004417 if (replen > 1) {
4418 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004419 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004420 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4421 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4422 goto onError;
4423 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004425 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004428 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004431 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004432 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437}
4438
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439/* --- UTF-7 Codec -------------------------------------------------------- */
4440
Antoine Pitrou244651a2009-05-04 18:56:13 +00004441/* See RFC2152 for details. We encode conservatively and decode liberally. */
4442
4443/* Three simple macros defining base-64. */
4444
/* Is c a base-64 character?  (letters, digits, '+' and '/') */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ('0' <= (c) && (c) <= '9') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('A' <= (c) && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else (i.e. '/') -> 63 */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 :                                \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4473
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 for the table to be consulted; NUL and anything
 * >= 128 are never direct.  directO and directWS are parenthesized so
 * that arbitrary expressions (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519
Alexander Belopolsky40018472011-02-26 01:02:56 +00004520PyObject *
4521PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004522 Py_ssize_t size,
4523 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004525 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4526}
4527
Antoine Pitrou244651a2009-05-04 18:56:13 +00004528/* The decoder. The only state we preserve is our read position,
4529 * i.e. how many characters we have consumed. So if we end in the
4530 * middle of a shift sequence we have to back off the read position
4531 * and the output to the beginning of the sequence, otherwise we lose
4532 * all the shift state (seen bits, number of bits seen, high
4533 * surrogate). */
4534
Alexander Belopolsky40018472011-02-26 01:02:56 +00004535PyObject *
4536PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004537 Py_ssize_t size,
4538 const char *errors,
4539 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004540{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004542 Py_ssize_t startinpos;
4543 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004545 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 const char *errmsg = "";
4547 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004548 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004549 unsigned int base64bits = 0;
4550 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004551 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 PyObject *errorHandler = NULL;
4553 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004555 if (size == 0) {
4556 if (consumed)
4557 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004558 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004559 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004561 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004562 _PyUnicodeWriter_Init(&writer);
4563 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004564
4565 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566 e = s + size;
4567
4568 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004569 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004571 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004572
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 if (inShift) { /* in a base-64 section */
4574 if (IS_BASE64(ch)) { /* consume a base-64 character */
4575 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4576 base64bits += 6;
4577 s++;
4578 if (base64bits >= 16) {
4579 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004580 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 base64bits -= 16;
4582 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004583 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 if (surrogate) {
4585 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004586 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4587 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004588 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004589 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004591 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 }
4593 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004594 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004595 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 }
4598 }
Victor Stinner551ac952011-11-29 22:58:13 +01004599 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 /* first surrogate */
4601 surrogate = outCh;
4602 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004604 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004605 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 }
4607 }
4608 }
4609 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004610 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 if (base64bits > 0) { /* left-over bits */
4612 if (base64bits >= 6) {
4613 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004614 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 errmsg = "partial character in shift sequence";
4616 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 else {
4619 /* Some bits remain; they should be zero */
4620 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004621 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 errmsg = "non-zero padding bits in shift sequence";
4623 goto utf7Error;
4624 }
4625 }
4626 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004627 if (surrogate && DECODE_DIRECT(ch)) {
4628 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4629 goto onError;
4630 }
4631 surrogate = 0;
4632 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 /* '-' is absorbed; other terminating
4634 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004635 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 }
4638 }
4639 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004641 s++; /* consume '+' */
4642 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004643 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004644 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004645 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004646 }
4647 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004649 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004650 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004651 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004652 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 }
4654 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004657 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004658 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004660 else {
4661 startinpos = s-starts;
4662 s++;
4663 errmsg = "unexpected special character";
4664 goto utf7Error;
4665 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004666 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004669 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004670 errors, &errorHandler,
4671 "utf7", errmsg,
4672 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 }
4676
Antoine Pitrou244651a2009-05-04 18:56:13 +00004677 /* end of string */
4678
4679 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4680 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004681 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004682 if (surrogate ||
4683 (base64bits >= 6) ||
4684 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004686 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 errors, &errorHandler,
4688 "utf7", "unterminated shift sequence",
4689 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004690 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 goto onError;
4692 if (s < e)
4693 goto restart;
4694 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004696
4697 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004698 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004700 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004701 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004702 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004703 writer.kind, writer.data, shiftOutStart);
4704 Py_XDECREF(errorHandler);
4705 Py_XDECREF(exc);
4706 _PyUnicodeWriter_Dealloc(&writer);
4707 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004708 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004709 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004710 }
4711 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004712 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004713 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004714 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 Py_XDECREF(errorHandler);
4717 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 Py_XDECREF(errorHandler);
4722 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004723 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004724 return NULL;
4725}
4726
4727
Alexander Belopolsky40018472011-02-26 01:02:56 +00004728PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004729_PyUnicode_EncodeUTF7(PyObject *str,
4730 int base64SetO,
4731 int base64WhiteSpace,
4732 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004734 int kind;
4735 void *data;
4736 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004737 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004738 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004739 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 unsigned int base64bits = 0;
4741 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742 char * out;
4743 char * start;
4744
Benjamin Petersonbac79492012-01-14 13:34:47 -05004745 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004746 return NULL;
4747 kind = PyUnicode_KIND(str);
4748 data = PyUnicode_DATA(str);
4749 len = PyUnicode_GET_LENGTH(str);
4750
4751 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004754 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004755 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004756 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004757 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758 if (v == NULL)
4759 return NULL;
4760
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004761 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004762 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004763 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 if (inShift) {
4766 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4767 /* shifting out */
4768 if (base64bits) { /* output remaining bits */
4769 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4770 base64buffer = 0;
4771 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 }
4773 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 /* Characters not in the BASE64 set implicitly unshift the sequence
4775 so no '-' is required, except if the character is itself a '-' */
4776 if (IS_BASE64(ch) || ch == '-') {
4777 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004778 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 *out++ = (char) ch;
4780 }
4781 else {
4782 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004783 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004784 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004785 else { /* not in a shift sequence */
4786 if (ch == '+') {
4787 *out++ = '+';
4788 *out++ = '-';
4789 }
4790 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4791 *out++ = (char) ch;
4792 }
4793 else {
4794 *out++ = '+';
4795 inShift = 1;
4796 goto encode_char;
4797 }
4798 }
4799 continue;
4800encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004802 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004803
Antoine Pitrou244651a2009-05-04 18:56:13 +00004804 /* code first surrogate */
4805 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004806 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 while (base64bits >= 6) {
4808 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4809 base64bits -= 6;
4810 }
4811 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004812 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814 base64bits += 16;
4815 base64buffer = (base64buffer << 16) | ch;
4816 while (base64bits >= 6) {
4817 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818 base64bits -= 6;
4819 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004820 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821 if (base64bits)
4822 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4823 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004824 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004825 if (_PyBytes_Resize(&v, out - start) < 0)
4826 return NULL;
4827 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004829PyObject *
4830PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4831 Py_ssize_t size,
4832 int base64SetO,
4833 int base64WhiteSpace,
4834 const char *errors)
4835{
4836 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004837 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004838 if (tmp == NULL)
4839 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004840 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004841 base64WhiteSpace, errors);
4842 Py_DECREF(tmp);
4843 return result;
4844}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004845
Antoine Pitrou244651a2009-05-04 18:56:13 +00004846#undef IS_BASE64
4847#undef FROM_BASE64
4848#undef TO_BASE64
4849#undef DECODE_DIRECT
4850#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004851
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852/* --- UTF-8 Codec -------------------------------------------------------- */
4853
Alexander Belopolsky40018472011-02-26 01:02:56 +00004854PyObject *
4855PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004856 Py_ssize_t size,
4857 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858{
Walter Dörwald69652032004-09-07 20:24:22 +00004859 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860}
4861
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862#include "stringlib/asciilib.h"
4863#include "stringlib/codecs.h"
4864#include "stringlib/undef.h"
4865
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004866#include "stringlib/ucs1lib.h"
4867#include "stringlib/codecs.h"
4868#include "stringlib/undef.h"
4869
4870#include "stringlib/ucs2lib.h"
4871#include "stringlib/codecs.h"
4872#include "stringlib/undef.h"
4873
4874#include "stringlib/ucs4lib.h"
4875#include "stringlib/codecs.h"
4876#include "stringlib/undef.h"
4877
Antoine Pitrouab868312009-01-10 15:40:25 +00004878/* Mask to quickly check whether a C 'long' contains a
4879 non-ASCII, UTF8-encoded char. */
4880#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004881# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004882#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004883# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004884#else
4885# error C 'long' size should be either 4 or 8!
4886#endif
4887
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888static Py_ssize_t
4889ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004892 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004894 /*
4895 * Issue #17237: m68k is a bit different from most architectures in
4896 * that objects do not use "natural alignment" - for example, int and
4897 * long are only aligned at 2-byte boundaries. Therefore the assert()
4898 * won't work; also, tests have shown that skipping the "optimised
4899 * version" will even speed up m68k.
4900 */
4901#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004903 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4904 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 /* Fast path, see in STRINGLIB(utf8_decode) for
4906 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004907 /* Help allocation */
4908 const char *_p = p;
4909 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 while (_p < aligned_end) {
4911 unsigned long value = *(const unsigned long *) _p;
4912 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 *((unsigned long *)q) = value;
4915 _p += SIZEOF_LONG;
4916 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004917 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 p = _p;
4919 while (p < end) {
4920 if ((unsigned char)*p & 0x80)
4921 break;
4922 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004927#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928 while (p < end) {
4929 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4930 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004931 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004932 /* Help allocation */
4933 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934 while (_p < aligned_end) {
4935 unsigned long value = *(unsigned long *) _p;
4936 if (value & ASCII_CHAR_MASK)
4937 break;
4938 _p += SIZEOF_LONG;
4939 }
4940 p = _p;
4941 if (_p == end)
4942 break;
4943 }
4944 if ((unsigned char)*p & 0x80)
4945 break;
4946 ++p;
4947 }
4948 memcpy(dest, start, p - start);
4949 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950}
Antoine Pitrouab868312009-01-10 15:40:25 +00004951
Victor Stinner785938e2011-12-11 20:09:03 +01004952PyObject *
4953PyUnicode_DecodeUTF8Stateful(const char *s,
4954 Py_ssize_t size,
4955 const char *errors,
4956 Py_ssize_t *consumed)
4957{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004958 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004959 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961
4962 Py_ssize_t startinpos;
4963 Py_ssize_t endinpos;
4964 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004965 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004967 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004968
4969 if (size == 0) {
4970 if (consumed)
4971 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004972 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004973 }
4974
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4976 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004977 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 *consumed = 1;
4979 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004980 }
4981
Victor Stinner8f674cc2013-04-17 23:02:17 +02004982 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004983 writer.min_length = size;
4984 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004986
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 writer.pos = ascii_decode(s, end, writer.data);
4988 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 while (s < end) {
4990 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004992
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004994 if (PyUnicode_IS_ASCII(writer.buffer))
4995 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 } else {
5001 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 }
5004
5005 switch (ch) {
5006 case 0:
5007 if (s == end || consumed)
5008 goto End;
5009 errmsg = "unexpected end of data";
5010 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005011 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 break;
5013 case 1:
5014 errmsg = "invalid start byte";
5015 startinpos = s - starts;
5016 endinpos = startinpos + 1;
5017 break;
5018 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005019 case 3:
5020 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 errmsg = "invalid continuation byte";
5022 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005023 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 break;
5025 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005026 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005027 goto onError;
5028 continue;
5029 }
5030
Victor Stinner1d65d912015-10-05 13:43:50 +02005031 if (error_handler == _Py_ERROR_UNKNOWN)
5032 error_handler = get_error_handler(errors);
5033
5034 switch (error_handler) {
5035 case _Py_ERROR_IGNORE:
5036 s += (endinpos - startinpos);
5037 break;
5038
5039 case _Py_ERROR_REPLACE:
5040 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5041 goto onError;
5042 s += (endinpos - startinpos);
5043 break;
5044
5045 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005046 {
5047 Py_ssize_t i;
5048
Victor Stinner1d65d912015-10-05 13:43:50 +02005049 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5050 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005051 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005052 ch = (Py_UCS4)(unsigned char)(starts[i]);
5053 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5054 ch + 0xdc00);
5055 writer.pos++;
5056 }
5057 s += (endinpos - startinpos);
5058 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005059 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005060
5061 default:
5062 if (unicode_decode_call_errorhandler_writer(
5063 errors, &error_handler_obj,
5064 "utf-8", errmsg,
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
5066 &writer))
5067 goto onError;
5068 }
Victor Stinner785938e2011-12-11 20:09:03 +01005069 }
5070
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 if (consumed)
5073 *consumed = s - starts;
5074
Victor Stinner1d65d912015-10-05 13:43:50 +02005075 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005077 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078
5079onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005080 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005082 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005084}
5085
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using the surrogateescape error handler,
   used to decode the command line arguments on Mac OS X and Android.

   s    -- UTF-8 encoded input
   size -- number of bytes in `s`

   Every byte that is not part of a valid UTF-8 sequence is emitted as the
   code unit 0xDC00 + byte.  When wchar_t is not 4 bytes wide, code points
   above 0xFFFF are emitted as a surrogate pair.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL if the buffer
   size would not fit in Py_ssize_t or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wchar_t units (including the
       terminator) are enough.

       The bound is checked without ever computing size + 1 on an
       unvalidated `size`: that addition would be signed overflow
       (undefined behaviour) for size == PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* Not expected with 4-byte storage (presumably the decoder
               stores every valid code point itself). */
            assert(0);
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch is a decoder status code here.  Zero with the input
               exhausted means we are done; anything else marks an
               undecodable byte at *s. */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005142/* Primary internal function which creates utf8 encoded bytes objects.
5143
5144 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005145 and allocate exactly as much space needed at the end. Else allocate the
5146 maximum possible needed (4 result bytes per Unicode character), and return
5147 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005148*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005149PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005150_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151{
Victor Stinner6099a032011-12-18 14:22:26 +01005152 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153 void *data;
5154 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005156 if (!PyUnicode_Check(unicode)) {
5157 PyErr_BadArgument();
5158 return NULL;
5159 }
5160
5161 if (PyUnicode_READY(unicode) == -1)
5162 return NULL;
5163
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005164 if (PyUnicode_UTF8(unicode))
5165 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5166 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005167
5168 kind = PyUnicode_KIND(unicode);
5169 data = PyUnicode_DATA(unicode);
5170 size = PyUnicode_GET_LENGTH(unicode);
5171
Benjamin Petersonead6b532011-12-20 17:23:42 -06005172 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005173 default:
5174 assert(0);
5175 case PyUnicode_1BYTE_KIND:
5176 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5177 assert(!PyUnicode_IS_ASCII(unicode));
5178 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5179 case PyUnicode_2BYTE_KIND:
5180 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5181 case PyUnicode_4BYTE_KIND:
5182 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184}
5185
Alexander Belopolsky40018472011-02-26 01:02:56 +00005186PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005187PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5188 Py_ssize_t size,
5189 const char *errors)
5190{
5191 PyObject *v, *unicode;
5192
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005193 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005194 if (unicode == NULL)
5195 return NULL;
5196 v = _PyUnicode_AsUTF8String(unicode, errors);
5197 Py_DECREF(unicode);
5198 return v;
5199}
5200
5201PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005202PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005204 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205}
5206
Walter Dörwald41980ca2007-08-16 21:55:45 +00005207/* --- UTF-32 Codec ------------------------------------------------------- */
5208
5209PyObject *
5210PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 Py_ssize_t size,
5212 const char *errors,
5213 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005214{
5215 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5216}
5217
5218PyObject *
5219PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 Py_ssize_t size,
5221 const char *errors,
5222 int *byteorder,
5223 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224{
5225 const char *starts = s;
5226 Py_ssize_t startinpos;
5227 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005228 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005229 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005230 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005231 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 PyObject *errorHandler = NULL;
5234 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005235
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236 q = (unsigned char *)s;
5237 e = q + size;
5238
5239 if (byteorder)
5240 bo = *byteorder;
5241
5242 /* Check for BOM marks (U+FEFF) in the input and adjust current
5243 byte order setting accordingly. In native mode, the leading BOM
5244 mark is skipped, in all other modes, it is copied to the output
5245 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005247 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 if (bom == 0x0000FEFF) {
5249 bo = -1;
5250 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005252 else if (bom == 0xFFFE0000) {
5253 bo = 1;
5254 q += 4;
5255 }
5256 if (byteorder)
5257 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005258 }
5259
Victor Stinnere64322e2012-10-30 23:12:47 +01005260 if (q == e) {
5261 if (consumed)
5262 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005263 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264 }
5265
Victor Stinnere64322e2012-10-30 23:12:47 +01005266#ifdef WORDS_BIGENDIAN
5267 le = bo < 0;
5268#else
5269 le = bo <= 0;
5270#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005271 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005272
Victor Stinner8f674cc2013-04-17 23:02:17 +02005273 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005274 writer.min_length = (e - q + 3) / 4;
5275 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005276 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005277
Victor Stinnere64322e2012-10-30 23:12:47 +01005278 while (1) {
5279 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005280 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005281
Victor Stinnere64322e2012-10-30 23:12:47 +01005282 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005283 enum PyUnicode_Kind kind = writer.kind;
5284 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005285 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005286 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005287 if (le) {
5288 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005289 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005290 if (ch > maxch)
5291 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005292 if (kind != PyUnicode_1BYTE_KIND &&
5293 Py_UNICODE_IS_SURROGATE(ch))
5294 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 q += 4;
5297 } while (q <= last);
5298 }
5299 else {
5300 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005301 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 if (ch > maxch)
5303 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 if (kind != PyUnicode_1BYTE_KIND &&
5305 Py_UNICODE_IS_SURROGATE(ch))
5306 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 q += 4;
5309 } while (q <= last);
5310 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005311 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005312 }
5313
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005314 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005315 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005316 startinpos = ((const char *)q) - starts;
5317 endinpos = startinpos + 4;
5318 }
5319 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005322 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 startinpos = ((const char *)q) - starts;
5325 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005327 else {
5328 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005329 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005330 goto onError;
5331 q += 4;
5332 continue;
5333 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005334 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 startinpos = ((const char *)q) - starts;
5336 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005338
5339 /* The remaining input chars are ignored if the callback
5340 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005341 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005345 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005347 }
5348
Walter Dörwald41980ca2007-08-16 21:55:45 +00005349 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005351
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352 Py_XDECREF(errorHandler);
5353 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005358 Py_XDECREF(errorHandler);
5359 Py_XDECREF(exc);
5360 return NULL;
5361}
5362
5363PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005364_PyUnicode_EncodeUTF32(PyObject *str,
5365 const char *errors,
5366 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005368 enum PyUnicode_Kind kind;
5369 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005370 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005371 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005372 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005373#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005374 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005376 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005378 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005379 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005380 PyObject *errorHandler = NULL;
5381 PyObject *exc = NULL;
5382 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005383
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005384 if (!PyUnicode_Check(str)) {
5385 PyErr_BadArgument();
5386 return NULL;
5387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005388 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 return NULL;
5390 kind = PyUnicode_KIND(str);
5391 data = PyUnicode_DATA(str);
5392 len = PyUnicode_GET_LENGTH(str);
5393
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005394 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005395 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005396 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005397 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005398 if (v == NULL)
5399 return NULL;
5400
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401 /* output buffer is 4-bytes aligned */
5402 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005403 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005404 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005406 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005408
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005410 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005411 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005412 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005413 else
5414 encoding = "utf-32";
5415
5416 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5418 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419 }
5420
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 pos = 0;
5422 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424
5425 if (kind == PyUnicode_2BYTE_KIND) {
5426 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5427 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005428 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 else {
5430 assert(kind == PyUnicode_4BYTE_KIND);
5431 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5432 &out, native_ordering);
5433 }
5434 if (pos == len)
5435 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005436
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005437 rep = unicode_encode_call_errorhandler(
5438 errors, &errorHandler,
5439 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005441 if (!rep)
5442 goto error;
5443
5444 if (PyBytes_Check(rep)) {
5445 repsize = PyBytes_GET_SIZE(rep);
5446 if (repsize & 3) {
5447 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 "surrogates not allowed");
5450 goto error;
5451 }
5452 moreunits = repsize / 4;
5453 }
5454 else {
5455 assert(PyUnicode_Check(rep));
5456 if (PyUnicode_READY(rep) < 0)
5457 goto error;
5458 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5459 if (!PyUnicode_IS_ASCII(rep)) {
5460 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 "surrogates not allowed");
5463 goto error;
5464 }
5465 }
5466
5467 /* four bytes are reserved for each surrogate */
5468 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005469 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005470 Py_ssize_t morebytes = 4 * (moreunits - 1);
5471 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5472 /* integer overflow */
5473 PyErr_NoMemory();
5474 goto error;
5475 }
5476 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5477 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005478 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 }
5480
5481 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005482 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005483 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005484 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005486 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5487 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005488 }
5489
5490 Py_CLEAR(rep);
5491 }
5492
5493 /* Cut back to size actually needed. This is necessary for, for example,
5494 encoding of a string containing isolated surrogates and the 'ignore'
5495 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005496 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 if (nsize != PyBytes_GET_SIZE(v))
5498 _PyBytes_Resize(&v, nsize);
5499 Py_XDECREF(errorHandler);
5500 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005502 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 error:
5504 Py_XDECREF(rep);
5505 Py_XDECREF(errorHandler);
5506 Py_XDECREF(exc);
5507 Py_XDECREF(v);
5508 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005509}
5510
Alexander Belopolsky40018472011-02-26 01:02:56 +00005511PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005512PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5513 Py_ssize_t size,
5514 const char *errors,
5515 int byteorder)
5516{
5517 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005518 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005519 if (tmp == NULL)
5520 return NULL;
5521 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5522 Py_DECREF(tmp);
5523 return result;
5524}
5525
5526PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005527PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005528{
Victor Stinnerb960b342011-11-20 19:12:52 +01005529 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005530}
5531
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532/* --- UTF-16 Codec ------------------------------------------------------- */
5533
Tim Peters772747b2001-08-09 22:21:55 +00005534PyObject *
5535PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 Py_ssize_t size,
5537 const char *errors,
5538 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539{
Walter Dörwald69652032004-09-07 20:24:22 +00005540 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5541}
5542
5543PyObject *
5544PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 Py_ssize_t size,
5546 const char *errors,
5547 int *byteorder,
5548 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005549{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005551 Py_ssize_t startinpos;
5552 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005553 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005554 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005555 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005556 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005557 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558 PyObject *errorHandler = NULL;
5559 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005560 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Tim Peters772747b2001-08-09 22:21:55 +00005562 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005563 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
5565 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005566 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005568 /* Check for BOM marks (U+FEFF) in the input and adjust current
5569 byte order setting accordingly. In native mode, the leading BOM
5570 mark is skipped, in all other modes, it is copied to the output
5571 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 if (bo == 0 && size >= 2) {
5573 const Py_UCS4 bom = (q[1] << 8) | q[0];
5574 if (bom == 0xFEFF) {
5575 q += 2;
5576 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005578 else if (bom == 0xFFFE) {
5579 q += 2;
5580 bo = 1;
5581 }
5582 if (byteorder)
5583 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Antoine Pitrou63065d72012-05-15 23:48:04 +02005586 if (q == e) {
5587 if (consumed)
5588 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005589 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005590 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005591
Christian Heimes743e0cd2012-10-17 23:52:17 +02005592#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005594 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005595#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005597 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005598#endif
Tim Peters772747b2001-08-09 22:21:55 +00005599
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600 /* Note: size will always be longer than the resulting Unicode
5601 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005602 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005603 writer.min_length = (e - q + 1) / 2;
5604 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005605 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005606
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 while (1) {
5608 Py_UCS4 ch = 0;
5609 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005610 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005612 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005613 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005614 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 native_ordering);
5616 else
5617 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005619 native_ordering);
5620 } else if (kind == PyUnicode_2BYTE_KIND) {
5621 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005622 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005623 native_ordering);
5624 } else {
5625 assert(kind == PyUnicode_4BYTE_KIND);
5626 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005629 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005630 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 switch (ch)
5633 {
5634 case 0:
5635 /* remaining byte at the end? (size should be even) */
5636 if (q == e || consumed)
5637 goto End;
5638 errmsg = "truncated data";
5639 startinpos = ((const char *)q) - starts;
5640 endinpos = ((const char *)e) - starts;
5641 break;
5642 /* The remaining input chars are ignored if the callback
5643 chooses to skip the input */
5644 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005645 q -= 2;
5646 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005647 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005648 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005649 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005650 endinpos = ((const char *)e) - starts;
5651 break;
5652 case 2:
5653 errmsg = "illegal encoding";
5654 startinpos = ((const char *)q) - 2 - starts;
5655 endinpos = startinpos + 2;
5656 break;
5657 case 3:
5658 errmsg = "illegal UTF-16 surrogate";
5659 startinpos = ((const char *)q) - 4 - starts;
5660 endinpos = startinpos + 2;
5661 break;
5662 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005663 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005664 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 continue;
5666 }
5667
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005668 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005669 errors,
5670 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005672 &starts,
5673 (const char **)&e,
5674 &startinpos,
5675 &endinpos,
5676 &exc,
5677 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 }
5681
Antoine Pitrou63065d72012-05-15 23:48:04 +02005682End:
Walter Dörwald69652032004-09-07 20:24:22 +00005683 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 Py_XDECREF(errorHandler);
5687 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 Py_XDECREF(errorHandler);
5693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 return NULL;
5695}
5696
Tim Peters772747b2001-08-09 22:21:55 +00005697PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005698_PyUnicode_EncodeUTF16(PyObject *str,
5699 const char *errors,
5700 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005702 enum PyUnicode_Kind kind;
5703 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005704 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005705 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005706 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005707 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005708#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005709 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005710#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005711 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005712#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 const char *encoding;
5714 Py_ssize_t nsize, pos;
5715 PyObject *errorHandler = NULL;
5716 PyObject *exc = NULL;
5717 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005718
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719 if (!PyUnicode_Check(str)) {
5720 PyErr_BadArgument();
5721 return NULL;
5722 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005723 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005724 return NULL;
5725 kind = PyUnicode_KIND(str);
5726 data = PyUnicode_DATA(str);
5727 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005728
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 if (kind == PyUnicode_4BYTE_KIND) {
5731 const Py_UCS4 *in = (const Py_UCS4 *)data;
5732 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005733 while (in < end) {
5734 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005736 }
5737 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005738 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005739 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005741 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005742 nsize = len + pairs + (byteorder == 0);
5743 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005744 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005748 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005749 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005750 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005752 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 }
5754 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005755 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 }
Tim Peters772747b2001-08-09 22:21:55 +00005757
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 if (kind == PyUnicode_1BYTE_KIND) {
5759 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5760 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005761 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005762
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005763 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005764 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
5766 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005768 }
5769 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005770 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005771 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005772
5773 pos = 0;
5774 while (pos < len) {
5775 Py_ssize_t repsize, moreunits;
5776
5777 if (kind == PyUnicode_2BYTE_KIND) {
5778 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5779 &out, native_ordering);
5780 }
5781 else {
5782 assert(kind == PyUnicode_4BYTE_KIND);
5783 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5784 &out, native_ordering);
5785 }
5786 if (pos == len)
5787 break;
5788
5789 rep = unicode_encode_call_errorhandler(
5790 errors, &errorHandler,
5791 encoding, "surrogates not allowed",
5792 str, &exc, pos, pos + 1, &pos);
5793 if (!rep)
5794 goto error;
5795
5796 if (PyBytes_Check(rep)) {
5797 repsize = PyBytes_GET_SIZE(rep);
5798 if (repsize & 1) {
5799 raise_encode_exception(&exc, encoding,
5800 str, pos - 1, pos,
5801 "surrogates not allowed");
5802 goto error;
5803 }
5804 moreunits = repsize / 2;
5805 }
5806 else {
5807 assert(PyUnicode_Check(rep));
5808 if (PyUnicode_READY(rep) < 0)
5809 goto error;
5810 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5811 if (!PyUnicode_IS_ASCII(rep)) {
5812 raise_encode_exception(&exc, encoding,
5813 str, pos - 1, pos,
5814 "surrogates not allowed");
5815 goto error;
5816 }
5817 }
5818
5819 /* two bytes are reserved for each surrogate */
5820 if (moreunits > 1) {
5821 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5822 Py_ssize_t morebytes = 2 * (moreunits - 1);
5823 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5824 /* integer overflow */
5825 PyErr_NoMemory();
5826 goto error;
5827 }
5828 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5829 goto error;
5830 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5831 }
5832
5833 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005834 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005835 out += moreunits;
5836 } else /* rep is unicode */ {
5837 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5838 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5839 &out, native_ordering);
5840 }
5841
5842 Py_CLEAR(rep);
5843 }
5844
5845 /* Cut back to size actually needed. This is necessary for, for example,
5846 encoding of a string containing isolated surrogates and the 'ignore' handler
5847 is used. */
5848 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5849 if (nsize != PyBytes_GET_SIZE(v))
5850 _PyBytes_Resize(&v, nsize);
5851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005853 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005854 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005855 error:
5856 Py_XDECREF(rep);
5857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
5859 Py_XDECREF(v);
5860 return NULL;
5861#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862}
5863
Alexander Belopolsky40018472011-02-26 01:02:56 +00005864PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5866 Py_ssize_t size,
5867 const char *errors,
5868 int byteorder)
5869{
5870 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005871 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005872 if (tmp == NULL)
5873 return NULL;
5874 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5875 Py_DECREF(tmp);
5876 return result;
5877}
5878
5879PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005880PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883}
5884
5885/* --- Unicode Escape Codec ----------------------------------------------- */
5886
Fredrik Lundh06d12682001-01-24 07:59:11 +00005887static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005888
Alexander Belopolsky40018472011-02-26 01:02:56 +00005889PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005890_PyUnicode_DecodeUnicodeEscape(const char *s,
5891 Py_ssize_t size,
5892 const char *errors,
5893 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 PyObject *errorHandler = NULL;
5899 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005900
Eric V. Smith42454af2016-10-31 09:22:08 -04005901 // so we can remember if we've seen an invalid escape char or not
5902 *first_invalid_escape = NULL;
5903
Victor Stinner62ec3312016-09-06 17:04:34 -07005904 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005905 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005906 }
5907 /* Escaped strings will always be longer than the resulting
5908 Unicode string, so we start with size here and then reduce the
5909 length after conversion to the true value.
5910 (but if the error callback returns a long replacement string
5911 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005912 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005913 writer.min_length = size;
5914 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5915 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005916 }
5917
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 end = s + size;
5919 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005920 unsigned char c = (unsigned char) *s++;
5921 Py_UCS4 ch;
5922 int count;
5923 Py_ssize_t startinpos;
5924 Py_ssize_t endinpos;
5925 const char *message;
5926
5927#define WRITE_ASCII_CHAR(ch) \
5928 do { \
5929 assert(ch <= 127); \
5930 assert(writer.pos < writer.size); \
5931 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5932 } while(0)
5933
5934#define WRITE_CHAR(ch) \
5935 do { \
5936 if (ch <= writer.maxchar) { \
5937 assert(writer.pos < writer.size); \
5938 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5939 } \
5940 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5941 goto onError; \
5942 } \
5943 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944
5945 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005946 if (c != '\\') {
5947 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 continue;
5949 }
5950
Victor Stinner62ec3312016-09-06 17:04:34 -07005951 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005953 if (s >= end) {
5954 message = "\\ at end of string";
5955 goto error;
5956 }
5957 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005958
Victor Stinner62ec3312016-09-06 17:04:34 -07005959 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005960 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005963 case '\n': continue;
5964 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5965 case '\'': WRITE_ASCII_CHAR('\''); continue;
5966 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5967 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5970 case 't': WRITE_ASCII_CHAR('\t'); continue;
5971 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5972 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005973 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005974 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005975 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005976 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 case '0': case '1': case '2': case '3':
5980 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005982 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005983 ch = (ch<<3) + *s++ - '0';
5984 if (s < end && '0' <= *s && *s <= '7') {
5985 ch = (ch<<3) + *s++ - '0';
5986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005988 WRITE_CHAR(ch);
5989 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 /* hex escapes */
5992 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005995 message = "truncated \\xXX escape";
5996 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006000 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006001 message = "truncated \\uXXXX escape";
6002 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006005 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 message = "truncated \\UXXXXXXXX escape";
6008 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006009 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006010 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 ch <<= 4;
6012 if (c >= '0' && c <= '9') {
6013 ch += c - '0';
6014 }
6015 else if (c >= 'a' && c <= 'f') {
6016 ch += c - ('a' - 10);
6017 }
6018 else if (c >= 'A' && c <= 'F') {
6019 ch += c - ('A' - 10);
6020 }
6021 else {
6022 break;
6023 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006024 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006025 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006026 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006027 }
6028
6029 /* when we get here, ch is a 32-bit unicode character */
6030 if (ch > MAX_UNICODE) {
6031 message = "illegal Unicode character";
6032 goto error;
6033 }
6034
6035 WRITE_CHAR(ch);
6036 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006037
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006039 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006040 if (ucnhash_CAPI == NULL) {
6041 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006042 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6043 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006044 if (ucnhash_CAPI == NULL) {
6045 PyErr_SetString(
6046 PyExc_UnicodeError,
6047 "\\N escapes not supported (can't load unicodedata module)"
6048 );
6049 goto onError;
6050 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006051 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006052
6053 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006055 const char *start = ++s;
6056 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006058 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006060 namelen = s - start;
6061 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006064 ch = 0xffffffff; /* in case 'getcode' messes up */
6065 if (namelen <= INT_MAX &&
6066 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6067 &ch, 0)) {
6068 assert(ch <= MAX_UNICODE);
6069 WRITE_CHAR(ch);
6070 continue;
6071 }
6072 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 }
6074 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006075 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076
6077 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006078 if (*first_invalid_escape == NULL) {
6079 *first_invalid_escape = s-1; /* Back up one char, since we've
6080 already incremented s. */
6081 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 WRITE_ASCII_CHAR('\\');
6083 WRITE_CHAR(c);
6084 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006086
6087 error:
6088 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006089 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006090 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006091 errors, &errorHandler,
6092 "unicodeescape", message,
6093 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006094 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006095 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006096 }
6097 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6098 goto onError;
6099 }
6100
6101#undef WRITE_ASCII_CHAR
6102#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006104
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006105 Py_XDECREF(errorHandler);
6106 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006107 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006108
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006110 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 Py_XDECREF(errorHandler);
6112 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 return NULL;
6114}
6115
Eric V. Smith42454af2016-10-31 09:22:08 -04006116PyObject *
6117PyUnicode_DecodeUnicodeEscape(const char *s,
6118 Py_ssize_t size,
6119 const char *errors)
6120{
6121 const char *first_invalid_escape;
6122 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6123 &first_invalid_escape);
6124 if (result == NULL)
6125 return NULL;
6126 if (first_invalid_escape != NULL) {
6127 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6128 "invalid escape sequence '\\%c'",
6129 *first_invalid_escape) < 0) {
6130 Py_DECREF(result);
6131 return NULL;
6132 }
6133 }
6134 return result;
6135}
6136
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006137/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Alexander Belopolsky40018472011-02-26 01:02:56 +00006139PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006143 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006145 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Ezio Melottie7f90372012-10-05 03:33:31 +03006149 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006150 escape.
6151
Ezio Melottie7f90372012-10-05 03:33:31 +03006152 For UCS1 strings it's '\xxx', 4 bytes per source character.
6153 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6154 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006155 */
6156
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157 if (!PyUnicode_Check(unicode)) {
6158 PyErr_BadArgument();
6159 return NULL;
6160 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 }
Victor Stinner358af132015-10-12 22:36:57 +02006164
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006166 if (len == 0) {
6167 return PyBytes_FromStringAndSize(NULL, 0);
6168 }
6169
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6173 bytes, and 1 byte characters 4. */
6174 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006175 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 return PyErr_NoMemory();
6177 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006178 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 if (repr == NULL) {
6180 return NULL;
6181 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006185 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006186
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 /* U+0000-U+00ff range */
6188 if (ch < 0x100) {
6189 if (ch >= ' ' && ch < 127) {
6190 if (ch != '\\') {
6191 /* Copy printable US ASCII as-is */
6192 *p++ = (char) ch;
6193 }
6194 /* Escape backslashes */
6195 else {
6196 *p++ = '\\';
6197 *p++ = '\\';
6198 }
6199 }
Victor Stinner358af132015-10-12 22:36:57 +02006200
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 /* Map special whitespace to '\t', \n', '\r' */
6202 else if (ch == '\t') {
6203 *p++ = '\\';
6204 *p++ = 't';
6205 }
6206 else if (ch == '\n') {
6207 *p++ = '\\';
6208 *p++ = 'n';
6209 }
6210 else if (ch == '\r') {
6211 *p++ = '\\';
6212 *p++ = 'r';
6213 }
6214
6215 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6216 else {
6217 *p++ = '\\';
6218 *p++ = 'x';
6219 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6220 *p++ = Py_hexdigits[ch & 0x000F];
6221 }
Tim Petersced69f82003-09-16 20:30:58 +00006222 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006223 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 *p++ = '\\';
6226 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006227 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6228 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6233 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006234
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 /* Make sure that the first two digits are zero */
6236 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006237 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006238 *p++ = 'U';
6239 *p++ = '0';
6240 *p++ = '0';
6241 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6242 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6243 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6244 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6245 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6246 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 assert(p - PyBytes_AS_STRING(repr) > 0);
6251 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6252 return NULL;
6253 }
6254 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255}
6256
Alexander Belopolsky40018472011-02-26 01:02:56 +00006257PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006258PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6259 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006261 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006262 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 }
6266
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006267 result = PyUnicode_AsUnicodeEscapeString(tmp);
6268 Py_DECREF(tmp);
6269 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270}
6271
6272/* --- Raw Unicode Escape Codec ------------------------------------------- */
6273
Alexander Belopolsky40018472011-02-26 01:02:56 +00006274PyObject *
6275PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006276 Py_ssize_t size,
6277 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006280 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006282 PyObject *errorHandler = NULL;
6283 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006284
Victor Stinner62ec3312016-09-06 17:04:34 -07006285 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006286 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006287 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 /* Escaped strings will always be longer than the resulting
6290 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 length after conversion to the true value. (But decoding error
6292 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006293 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 writer.min_length = size;
6295 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6296 goto onError;
6297 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006298
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 end = s + size;
6300 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 unsigned char c = (unsigned char) *s++;
6302 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006303 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 Py_ssize_t startinpos;
6305 Py_ssize_t endinpos;
6306 const char *message;
6307
6308#define WRITE_CHAR(ch) \
6309 do { \
6310 if (ch <= writer.maxchar) { \
6311 assert(writer.pos < writer.size); \
6312 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6313 } \
6314 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6315 goto onError; \
6316 } \
6317 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 if (c != '\\' || s >= end) {
6321 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006323 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006324
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 c = (unsigned char) *s++;
6326 if (c == 'u') {
6327 count = 4;
6328 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 else if (c == 'U') {
6331 count = 8;
6332 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006333 }
6334 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 assert(writer.pos < writer.size);
6336 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6337 WRITE_CHAR(c);
6338 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006339 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 startinpos = s - starts - 2;
6341
6342 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6343 for (ch = 0; count && s < end; ++s, --count) {
6344 c = (unsigned char)*s;
6345 ch <<= 4;
6346 if (c >= '0' && c <= '9') {
6347 ch += c - '0';
6348 }
6349 else if (c >= 'a' && c <= 'f') {
6350 ch += c - ('a' - 10);
6351 }
6352 else if (c >= 'A' && c <= 'F') {
6353 ch += c - ('A' - 10);
6354 }
6355 else {
6356 break;
6357 }
6358 }
6359 if (!count) {
6360 if (ch <= MAX_UNICODE) {
6361 WRITE_CHAR(ch);
6362 continue;
6363 }
6364 message = "\\Uxxxxxxxx out of range";
6365 }
6366
6367 endinpos = s-starts;
6368 writer.min_length = end - s + writer.pos;
6369 if (unicode_decode_call_errorhandler_writer(
6370 errors, &errorHandler,
6371 "rawunicodeescape", message,
6372 &starts, &end, &startinpos, &endinpos, &exc, &s,
6373 &writer)) {
6374 goto onError;
6375 }
6376 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6377 goto onError;
6378 }
6379
6380#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382 Py_XDECREF(errorHandler);
6383 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006384 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006385
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006387 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388 Py_XDECREF(errorHandler);
6389 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006391
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392}
6393
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006394
Alexander Belopolsky40018472011-02-26 01:02:56 +00006395PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006396PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Victor Stinner62ec3312016-09-06 17:04:34 -07006398 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401 int kind;
6402 void *data;
6403 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006405 if (!PyUnicode_Check(unicode)) {
6406 PyErr_BadArgument();
6407 return NULL;
6408 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412 kind = PyUnicode_KIND(unicode);
6413 data = PyUnicode_DATA(unicode);
6414 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 if (kind == PyUnicode_1BYTE_KIND) {
6416 return PyBytes_FromStringAndSize(data, len);
6417 }
Victor Stinner0e368262011-11-10 20:12:49 +01006418
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6420 bytes, and 1 byte characters 4. */
6421 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006422
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (len > PY_SSIZE_T_MAX / expandsize) {
6424 return PyErr_NoMemory();
6425 }
6426 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6427 if (repr == NULL) {
6428 return NULL;
6429 }
6430 if (len == 0) {
6431 return repr;
6432 }
6433
6434 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006435 for (pos = 0; pos < len; pos++) {
6436 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006437
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6439 if (ch < 0x100) {
6440 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006441 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006442 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6443 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 *p++ = '\\';
6445 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006446 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6447 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6448 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6449 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6452 else {
6453 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6454 *p++ = '\\';
6455 *p++ = 'U';
6456 *p++ = '0';
6457 *p++ = '0';
6458 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6459 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6460 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463 *p++ = Py_hexdigits[ch & 15];
6464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006466
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 assert(p > PyBytes_AS_STRING(repr));
6468 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6469 return NULL;
6470 }
6471 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472}
6473
Alexander Belopolsky40018472011-02-26 01:02:56 +00006474PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006475PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006478 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006479 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006480 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006481 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006482 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6483 Py_DECREF(tmp);
6484 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485}
6486
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006487/* --- Unicode Internal Codec ------------------------------------------- */
6488
Alexander Belopolsky40018472011-02-26 01:02:56 +00006489PyObject *
6490_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006491 Py_ssize_t size,
6492 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006493{
6494 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 Py_ssize_t startinpos;
6496 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006497 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006498 const char *end;
6499 const char *reason;
6500 PyObject *errorHandler = NULL;
6501 PyObject *exc = NULL;
6502
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006503 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006504 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006505 1))
6506 return NULL;
6507
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006508 if (size == 0)
6509 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006510
Victor Stinner8f674cc2013-04-17 23:02:17 +02006511 _PyUnicodeWriter_Init(&writer);
6512 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6513 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006515 }
6516 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517
Victor Stinner8f674cc2013-04-17 23:02:17 +02006518 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006519 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006520 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006521 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006522 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006523 endinpos = end-starts;
6524 reason = "truncated input";
6525 goto error;
6526 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006527 /* We copy the raw representation one byte at a time because the
6528 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006529 ((char *) &uch)[0] = s[0];
6530 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006531#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006532 ((char *) &uch)[2] = s[2];
6533 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006534#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006535 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006536#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 /* We have to sanity check the raw data, otherwise doom looms for
6538 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006539 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006540 endinpos = s - starts + Py_UNICODE_SIZE;
6541 reason = "illegal code point (> 0x10FFFF)";
6542 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006543 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006544#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006545 s += Py_UNICODE_SIZE;
6546#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006547 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006548 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006549 Py_UNICODE uch2;
6550 ((char *) &uch2)[0] = s[0];
6551 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006552 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006553 {
Victor Stinner551ac952011-11-29 22:58:13 +01006554 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006555 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006556 }
6557 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006558#endif
6559
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006560 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006561 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006562 continue;
6563
6564 error:
6565 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006566 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006567 errors, &errorHandler,
6568 "unicode_internal", reason,
6569 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006570 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006571 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006572 }
6573
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006574 Py_XDECREF(errorHandler);
6575 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006576 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006577
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006579 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006580 Py_XDECREF(errorHandler);
6581 Py_XDECREF(exc);
6582 return NULL;
6583}
6584
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585/* --- Latin-1 Codec ------------------------------------------------------ */
6586
Alexander Belopolsky40018472011-02-26 01:02:56 +00006587PyObject *
6588PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006589 Py_ssize_t size,
6590 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006593 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594}
6595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597static void
6598make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006599 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006600 PyObject *unicode,
6601 Py_ssize_t startpos, Py_ssize_t endpos,
6602 const char *reason)
6603{
6604 if (*exceptionObject == NULL) {
6605 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006607 encoding, unicode, startpos, endpos, reason);
6608 }
6609 else {
6610 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6611 goto onError;
6612 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6613 goto onError;
6614 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6615 goto onError;
6616 return;
6617 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006618 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006619 }
6620}
6621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623static void
6624raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006625 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006626 PyObject *unicode,
6627 Py_ssize_t startpos, Py_ssize_t endpos,
6628 const char *reason)
6629{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006630 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006631 encoding, unicode, startpos, endpos, reason);
6632 if (*exceptionObject != NULL)
6633 PyCodec_StrictErrors(*exceptionObject);
6634}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635
6636/* error handling callback helper:
6637 build arguments, call the callback and check the arguments,
6638 put the result into newpos and return the replacement string, which
6639 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640static PyObject *
6641unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006642 PyObject **errorHandler,
6643 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006645 Py_ssize_t startpos, Py_ssize_t endpos,
6646 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006648 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 PyObject *restuple;
6651 PyObject *resunicode;
6652
6653 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 }
6658
Benjamin Petersonbac79492012-01-14 13:34:47 -05006659 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006660 return NULL;
6661 len = PyUnicode_GET_LENGTH(unicode);
6662
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006663 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006668 restuple = PyObject_CallFunctionObjArgs(
6669 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006673 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 Py_DECREF(restuple);
6675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006677 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 &resunicode, newpos)) {
6679 Py_DECREF(restuple);
6680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006682 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6683 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6684 Py_DECREF(restuple);
6685 return NULL;
6686 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006688 *newpos = len + *newpos;
6689 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006690 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 Py_DECREF(restuple);
6692 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006693 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 Py_INCREF(resunicode);
6695 Py_DECREF(restuple);
6696 return resunicode;
6697}
6698
Alexander Belopolsky40018472011-02-26 01:02:56 +00006699static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006700unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006701 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006702 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006704 /* input state */
6705 Py_ssize_t pos=0, size;
6706 int kind;
6707 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 /* pointer into the output */
6709 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006710 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6711 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006712 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006714 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006715 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006716 /* output object */
6717 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718
Benjamin Petersonbac79492012-01-14 13:34:47 -05006719 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 return NULL;
6721 size = PyUnicode_GET_LENGTH(unicode);
6722 kind = PyUnicode_KIND(unicode);
6723 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 /* allocate enough for a simple encoding without
6725 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006726 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006727 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006728
6729 _PyBytesWriter_Init(&writer);
6730 str = _PyBytesWriter_Alloc(&writer, size);
6731 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006732 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006735 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006738 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006740 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006741 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006744 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006746 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006747 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006749
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006750 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006752
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006753 /* Only overallocate the buffer if it's not the last write */
6754 writer.overallocate = (collend < size);
6755
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006757 if (error_handler == _Py_ERROR_UNKNOWN)
6758 error_handler = get_error_handler(errors);
6759
6760 switch (error_handler) {
6761 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006762 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006764
6765 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006766 memset(str, '?', collend - collstart);
6767 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006768 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006769 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006770 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 break;
Victor Stinner50149202015-09-22 00:26:54 +02006772
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006773 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006774 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006775 writer.min_size -= (collend - collstart);
6776 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006777 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006778 if (str == NULL)
6779 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006780 pos = collend;
6781 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006782
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006783 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006784 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006785 writer.min_size -= (collend - collstart);
6786 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006787 unicode, collstart, collend);
6788 if (str == NULL)
6789 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006790 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 break;
Victor Stinner50149202015-09-22 00:26:54 +02006792
Victor Stinnerc3713e92015-09-29 12:32:13 +02006793 case _Py_ERROR_SURROGATEESCAPE:
6794 for (i = collstart; i < collend; ++i) {
6795 ch = PyUnicode_READ(kind, data, i);
6796 if (ch < 0xdc80 || 0xdcff < ch) {
6797 /* Not a UTF-8b surrogate */
6798 break;
6799 }
6800 *str++ = (char)(ch - 0xdc00);
6801 ++pos;
6802 }
6803 if (i >= collend)
6804 break;
6805 collstart = pos;
6806 assert(collstart != collend);
6807 /* fallback to general error handling */
6808
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006810 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6811 encoding, reason, unicode, &exc,
6812 collstart, collend, &newpos);
6813 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006815
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006816 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006817 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006818
Victor Stinner6bd525b2015-10-09 13:10:05 +02006819 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006820 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006821 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006822 PyBytes_AS_STRING(rep),
6823 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006824 if (str == NULL)
6825 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006826 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006827 else {
6828 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006829
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006832
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006833 if (limit == 256 ?
6834 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6835 !PyUnicode_IS_ASCII(rep))
6836 {
6837 /* Not all characters are smaller than limit */
6838 raise_encode_exception(&exc, encoding, unicode,
6839 collstart, collend, reason);
6840 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006842 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6843 str = _PyBytesWriter_WriteBytes(&writer, str,
6844 PyUnicode_DATA(rep),
6845 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006847 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006849 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006850
6851 /* If overallocation was disabled, ensure that it was the last
6852 write. Otherwise, we missed an optimization */
6853 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006854 }
6855 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006856
Victor Stinner50149202015-09-22 00:26:54 +02006857 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006859 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006860
6861 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006862 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006863 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006864 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006865 Py_XDECREF(exc);
6866 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006867}
6868
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006869/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006870PyObject *
6871PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006872 Py_ssize_t size,
6873 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006875 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006876 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006877 if (unicode == NULL)
6878 return NULL;
6879 result = unicode_encode_ucs1(unicode, errors, 256);
6880 Py_DECREF(unicode);
6881 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882}
6883
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006885_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
6887 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 PyErr_BadArgument();
6889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006891 if (PyUnicode_READY(unicode) == -1)
6892 return NULL;
6893 /* Fast path: if it is a one-byte string, construct
6894 bytes object directly. */
6895 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6896 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6897 PyUnicode_GET_LENGTH(unicode));
6898 /* Non-Latin-1 characters present. Defer to above function to
6899 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006900 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006901}
6902
6903PyObject*
6904PyUnicode_AsLatin1String(PyObject *unicode)
6905{
6906 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907}
6908
6909/* --- 7-bit ASCII Codec -------------------------------------------------- */
6910
Alexander Belopolsky40018472011-02-26 01:02:56 +00006911PyObject *
6912PyUnicode_DecodeASCII(const char *s,
6913 Py_ssize_t size,
6914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006917 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006918 int kind;
6919 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006920 Py_ssize_t startinpos;
6921 Py_ssize_t endinpos;
6922 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006924 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006925 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006926 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006927
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006929 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006930
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006932 if (size == 1 && (unsigned char)s[0] < 128)
6933 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006934
Victor Stinner8f674cc2013-04-17 23:02:17 +02006935 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006936 writer.min_length = size;
6937 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006938 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006941 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006942 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006943 writer.pos = outpos;
6944 if (writer.pos == size)
6945 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006946
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006947 s += writer.pos;
6948 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006949 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006950 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006952 PyUnicode_WRITE(kind, data, writer.pos, c);
6953 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006955 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006956 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006957
6958 /* byte outsize range 0x00..0x7f: call the error handler */
6959
6960 if (error_handler == _Py_ERROR_UNKNOWN)
6961 error_handler = get_error_handler(errors);
6962
6963 switch (error_handler)
6964 {
6965 case _Py_ERROR_REPLACE:
6966 case _Py_ERROR_SURROGATEESCAPE:
6967 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006968 but we may switch to UCS2 at the first write */
6969 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6970 goto onError;
6971 kind = writer.kind;
6972 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006973
6974 if (error_handler == _Py_ERROR_REPLACE)
6975 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6976 else
6977 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6978 writer.pos++;
6979 ++s;
6980 break;
6981
6982 case _Py_ERROR_IGNORE:
6983 ++s;
6984 break;
6985
6986 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 startinpos = s-starts;
6988 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006989 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006990 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 "ascii", "ordinal not in range(128)",
6992 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006993 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006995 kind = writer.kind;
6996 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006999 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007000 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007001 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007002
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007005 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 return NULL;
7008}
7009
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007010/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007011PyObject *
7012PyUnicode_EncodeASCII(const Py_UNICODE *p,
7013 Py_ssize_t size,
7014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007016 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007017 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007018 if (unicode == NULL)
7019 return NULL;
7020 result = unicode_encode_ucs1(unicode, errors, 128);
7021 Py_DECREF(unicode);
7022 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023}
7024
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007026_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
7028 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 PyErr_BadArgument();
7030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007032 if (PyUnicode_READY(unicode) == -1)
7033 return NULL;
7034 /* Fast path: if it is an ASCII-only string, construct bytes object
7035 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007036 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007037 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7038 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007039 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040}
7041
7042PyObject *
7043PyUnicode_AsASCIIString(PyObject *unicode)
7044{
7045 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046}
7047
Steve Dowercc16be82016-09-08 10:35:16 -07007048#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007049
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007050/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007051
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007052#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053#define NEED_RETRY
7054#endif
7055
Victor Stinner3a50e702011-10-18 21:21:00 +02007056#ifndef WC_ERR_INVALID_CHARS
7057# define WC_ERR_INVALID_CHARS 0x0080
7058#endif
7059
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007060static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007061code_page_name(UINT code_page, PyObject **obj)
7062{
7063 *obj = NULL;
7064 if (code_page == CP_ACP)
7065 return "mbcs";
7066 if (code_page == CP_UTF7)
7067 return "CP_UTF7";
7068 if (code_page == CP_UTF8)
7069 return "CP_UTF8";
7070
7071 *obj = PyBytes_FromFormat("cp%u", code_page);
7072 if (*obj == NULL)
7073 return NULL;
7074 return PyBytes_AS_STRING(*obj);
7075}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076
Victor Stinner3a50e702011-10-18 21:21:00 +02007077static DWORD
7078decode_code_page_flags(UINT code_page)
7079{
7080 if (code_page == CP_UTF7) {
7081 /* The CP_UTF7 decoder only supports flags=0 */
7082 return 0;
7083 }
7084 else
7085 return MB_ERR_INVALID_CHARS;
7086}
7087
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 * Decode a byte string from a Windows code page into unicode object in strict
7090 * mode.
7091 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007092 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7093 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007095static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007096decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007097 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 const char *in,
7099 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100{
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007102 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104
7105 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 assert(insize > 0);
7107 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7108 if (outsize <= 0)
7109 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110
7111 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007113 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007114 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 if (*v == NULL)
7116 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118 }
7119 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007122 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125 }
7126
7127 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7129 if (outsize <= 0)
7130 goto error;
7131 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007132
Victor Stinner3a50e702011-10-18 21:21:00 +02007133error:
7134 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7135 return -2;
7136 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007137 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138}
7139
Victor Stinner3a50e702011-10-18 21:21:00 +02007140/*
7141 * Decode a byte string from a code page into unicode object with an error
7142 * handler.
7143 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007144 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 * UnicodeDecodeError exception and returns -1 on error.
7146 */
7147static int
7148decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007149 PyObject **v,
7150 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007151 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007152{
7153 const char *startin = in;
7154 const char *endin = in + size;
7155 const DWORD flags = decode_code_page_flags(code_page);
7156 /* Ideally, we should get reason from FormatMessage. This is the Windows
7157 2000 English version of the message. */
7158 const char *reason = "No mapping for the Unicode character exists "
7159 "in the target code page.";
7160 /* each step cannot decode more than 1 character, but a character can be
7161 represented as a surrogate pair */
7162 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007163 int insize;
7164 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 PyObject *errorHandler = NULL;
7166 PyObject *exc = NULL;
7167 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007168 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 DWORD err;
7170 int ret = -1;
7171
7172 assert(size > 0);
7173
7174 encoding = code_page_name(code_page, &encoding_obj);
7175 if (encoding == NULL)
7176 return -1;
7177
Victor Stinner7d00cc12014-03-17 23:08:06 +01007178 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7180 UnicodeDecodeError. */
7181 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7182 if (exc != NULL) {
7183 PyCodec_StrictErrors(exc);
7184 Py_CLEAR(exc);
7185 }
7186 goto error;
7187 }
7188
7189 if (*v == NULL) {
7190 /* Create unicode object */
7191 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7192 PyErr_NoMemory();
7193 goto error;
7194 }
Victor Stinnerab595942011-12-17 04:59:06 +01007195 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007196 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 if (*v == NULL)
7198 goto error;
7199 startout = PyUnicode_AS_UNICODE(*v);
7200 }
7201 else {
7202 /* Extend unicode object */
7203 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7204 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7205 PyErr_NoMemory();
7206 goto error;
7207 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007208 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 goto error;
7210 startout = PyUnicode_AS_UNICODE(*v) + n;
7211 }
7212
7213 /* Decode the byte string character per character */
7214 out = startout;
7215 while (in < endin)
7216 {
7217 /* Decode a character */
7218 insize = 1;
7219 do
7220 {
7221 outsize = MultiByteToWideChar(code_page, flags,
7222 in, insize,
7223 buffer, Py_ARRAY_LENGTH(buffer));
7224 if (outsize > 0)
7225 break;
7226 err = GetLastError();
7227 if (err != ERROR_NO_UNICODE_TRANSLATION
7228 && err != ERROR_INSUFFICIENT_BUFFER)
7229 {
7230 PyErr_SetFromWindowsErr(0);
7231 goto error;
7232 }
7233 insize++;
7234 }
7235 /* 4=maximum length of a UTF-8 sequence */
7236 while (insize <= 4 && (in + insize) <= endin);
7237
7238 if (outsize <= 0) {
7239 Py_ssize_t startinpos, endinpos, outpos;
7240
Victor Stinner7d00cc12014-03-17 23:08:06 +01007241 /* last character in partial decode? */
7242 if (in + insize >= endin && !final)
7243 break;
7244
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 startinpos = in - startin;
7246 endinpos = startinpos + 1;
7247 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007248 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 errors, &errorHandler,
7250 encoding, reason,
7251 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007252 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 {
7254 goto error;
7255 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007256 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 }
7258 else {
7259 in += insize;
7260 memcpy(out, buffer, outsize * sizeof(wchar_t));
7261 out += outsize;
7262 }
7263 }
7264
7265 /* write a NUL character at the end */
7266 *out = 0;
7267
7268 /* Extend unicode object */
7269 outsize = out - startout;
7270 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007271 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007273 /* (in - startin) <= size and size is an int */
7274 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007275
7276error:
7277 Py_XDECREF(encoding_obj);
7278 Py_XDECREF(errorHandler);
7279 Py_XDECREF(exc);
7280 return ret;
7281}
7282
Victor Stinner3a50e702011-10-18 21:21:00 +02007283static PyObject *
7284decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007285 const char *s, Py_ssize_t size,
7286 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287{
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 PyObject *v = NULL;
7289 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 if (code_page < 0) {
7292 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7293 return NULL;
7294 }
7295
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007296 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007298
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 do
7300 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007302 if (size > INT_MAX) {
7303 chunk_size = INT_MAX;
7304 final = 0;
7305 done = 0;
7306 }
7307 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007309 {
7310 chunk_size = (int)size;
7311 final = (consumed == NULL);
7312 done = 1;
7313 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314
Victor Stinner76a31a62011-11-04 00:05:13 +01007315 if (chunk_size == 0 && done) {
7316 if (v != NULL)
7317 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007318 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 converted = decode_code_page_strict(code_page, &v,
7322 s, chunk_size);
7323 if (converted == -2)
7324 converted = decode_code_page_errors(code_page, &v,
7325 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007326 errors, final);
7327 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007328
7329 if (converted < 0) {
7330 Py_XDECREF(v);
7331 return NULL;
7332 }
7333
7334 if (consumed)
7335 *consumed += converted;
7336
7337 s += converted;
7338 size -= converted;
7339 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007340
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007341 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342}
7343
Alexander Belopolsky40018472011-02-26 01:02:56 +00007344PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007345PyUnicode_DecodeCodePageStateful(int code_page,
7346 const char *s,
7347 Py_ssize_t size,
7348 const char *errors,
7349 Py_ssize_t *consumed)
7350{
7351 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7352}
7353
7354PyObject *
7355PyUnicode_DecodeMBCSStateful(const char *s,
7356 Py_ssize_t size,
7357 const char *errors,
7358 Py_ssize_t *consumed)
7359{
7360 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7361}
7362
7363PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007364PyUnicode_DecodeMBCS(const char *s,
7365 Py_ssize_t size,
7366 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007367{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7369}
7370
Victor Stinner3a50e702011-10-18 21:21:00 +02007371static DWORD
7372encode_code_page_flags(UINT code_page, const char *errors)
7373{
7374 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007375 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 }
7377 else if (code_page == CP_UTF7) {
7378 /* CP_UTF7 only supports flags=0 */
7379 return 0;
7380 }
7381 else {
7382 if (errors != NULL && strcmp(errors, "replace") == 0)
7383 return 0;
7384 else
7385 return WC_NO_BEST_FIT_CHARS;
7386 }
7387}
7388
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007389/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 * Encode a Unicode string to a Windows code page into a byte string in strict
7391 * mode.
7392 *
7393 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007394 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007396static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007397encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007398 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007400{
Victor Stinner554f3f02010-06-16 23:33:54 +00007401 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 BOOL *pusedDefaultChar = &usedDefaultChar;
7403 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007404 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007405 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 const DWORD flags = encode_code_page_flags(code_page, NULL);
7407 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007408 /* Create a substring so that we can get the UTF-16 representation
7409 of just the slice under consideration. */
7410 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411
Martin v. Löwis3d325192011-11-04 18:23:06 +01007412 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007413
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007415 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007417 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007418
Victor Stinner2fc507f2011-11-04 20:06:39 +01007419 substring = PyUnicode_Substring(unicode, offset, offset+len);
7420 if (substring == NULL)
7421 return -1;
7422 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7423 if (p == NULL) {
7424 Py_DECREF(substring);
7425 return -1;
7426 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007427 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007428
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007429 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007431 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 NULL, 0,
7433 NULL, pusedDefaultChar);
7434 if (outsize <= 0)
7435 goto error;
7436 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 if (pusedDefaultChar && *pusedDefaultChar) {
7438 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007440 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007441
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007444 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007445 if (*outbytes == NULL) {
7446 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007448 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007450 }
7451 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 const Py_ssize_t n = PyBytes_Size(*outbytes);
7454 if (outsize > PY_SSIZE_T_MAX - n) {
7455 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7460 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464 }
7465
7466 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007468 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 out, outsize,
7470 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 if (outsize <= 0)
7473 goto error;
7474 if (pusedDefaultChar && *pusedDefaultChar)
7475 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007476 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7481 return -2;
7482 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007483 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007484}
7485
Victor Stinner3a50e702011-10-18 21:21:00 +02007486/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007487 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 * error handler.
7489 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007490 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 * -1 on other error.
7492 */
7493static int
7494encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007495 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007497{
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007499 Py_ssize_t pos = unicode_offset;
7500 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 /* Ideally, we should get reason from FormatMessage. This is the Windows
7502 2000 English version of the message. */
7503 const char *reason = "invalid character";
7504 /* 4=maximum length of a UTF-8 sequence */
7505 char buffer[4];
7506 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7507 Py_ssize_t outsize;
7508 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 PyObject *errorHandler = NULL;
7510 PyObject *exc = NULL;
7511 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007512 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 PyObject *rep;
7515 int ret = -1;
7516
7517 assert(insize > 0);
7518
7519 encoding = code_page_name(code_page, &encoding_obj);
7520 if (encoding == NULL)
7521 return -1;
7522
7523 if (errors == NULL || strcmp(errors, "strict") == 0) {
7524 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7525 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007526 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 if (exc != NULL) {
7528 PyCodec_StrictErrors(exc);
7529 Py_DECREF(exc);
7530 }
7531 Py_XDECREF(encoding_obj);
7532 return -1;
7533 }
7534
7535 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7536 pusedDefaultChar = &usedDefaultChar;
7537 else
7538 pusedDefaultChar = NULL;
7539
7540 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7541 PyErr_NoMemory();
7542 goto error;
7543 }
7544 outsize = insize * Py_ARRAY_LENGTH(buffer);
7545
7546 if (*outbytes == NULL) {
7547 /* Create string object */
7548 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7549 if (*outbytes == NULL)
7550 goto error;
7551 out = PyBytes_AS_STRING(*outbytes);
7552 }
7553 else {
7554 /* Extend string object */
7555 Py_ssize_t n = PyBytes_Size(*outbytes);
7556 if (n > PY_SSIZE_T_MAX - outsize) {
7557 PyErr_NoMemory();
7558 goto error;
7559 }
7560 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7561 goto error;
7562 out = PyBytes_AS_STRING(*outbytes) + n;
7563 }
7564
7565 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007566 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007568 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7569 wchar_t chars[2];
7570 int charsize;
7571 if (ch < 0x10000) {
7572 chars[0] = (wchar_t)ch;
7573 charsize = 1;
7574 }
7575 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007576 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7577 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007578 charsize = 2;
7579 }
7580
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007582 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 buffer, Py_ARRAY_LENGTH(buffer),
7584 NULL, pusedDefaultChar);
7585 if (outsize > 0) {
7586 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7587 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007588 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 memcpy(out, buffer, outsize);
7590 out += outsize;
7591 continue;
7592 }
7593 }
7594 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7595 PyErr_SetFromWindowsErr(0);
7596 goto error;
7597 }
7598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 rep = unicode_encode_call_errorhandler(
7600 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007601 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007602 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 if (rep == NULL)
7604 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007605 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007606
7607 if (PyBytes_Check(rep)) {
7608 outsize = PyBytes_GET_SIZE(rep);
7609 if (outsize != 1) {
7610 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7611 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7612 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7613 Py_DECREF(rep);
7614 goto error;
7615 }
7616 out = PyBytes_AS_STRING(*outbytes) + offset;
7617 }
7618 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7619 out += outsize;
7620 }
7621 else {
7622 Py_ssize_t i;
7623 enum PyUnicode_Kind kind;
7624 void *data;
7625
Benjamin Petersonbac79492012-01-14 13:34:47 -05007626 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 Py_DECREF(rep);
7628 goto error;
7629 }
7630
7631 outsize = PyUnicode_GET_LENGTH(rep);
7632 if (outsize != 1) {
7633 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7634 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7635 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7636 Py_DECREF(rep);
7637 goto error;
7638 }
7639 out = PyBytes_AS_STRING(*outbytes) + offset;
7640 }
7641 kind = PyUnicode_KIND(rep);
7642 data = PyUnicode_DATA(rep);
7643 for (i=0; i < outsize; i++) {
7644 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7645 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007646 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007647 encoding, unicode,
7648 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 "unable to encode error handler result to ASCII");
7650 Py_DECREF(rep);
7651 goto error;
7652 }
7653 *out = (unsigned char)ch;
7654 out++;
7655 }
7656 }
7657 Py_DECREF(rep);
7658 }
7659 /* write a NUL byte */
7660 *out = 0;
7661 outsize = out - PyBytes_AS_STRING(*outbytes);
7662 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7663 if (_PyBytes_Resize(outbytes, outsize) < 0)
7664 goto error;
7665 ret = 0;
7666
7667error:
7668 Py_XDECREF(encoding_obj);
7669 Py_XDECREF(errorHandler);
7670 Py_XDECREF(exc);
7671 return ret;
7672}
7673
Victor Stinner3a50e702011-10-18 21:21:00 +02007674static PyObject *
7675encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007676 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007677 const char *errors)
7678{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007679 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007681 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007682 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007683
Victor Stinner29dacf22015-01-26 16:41:32 +01007684 if (!PyUnicode_Check(unicode)) {
7685 PyErr_BadArgument();
7686 return NULL;
7687 }
7688
Benjamin Petersonbac79492012-01-14 13:34:47 -05007689 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007690 return NULL;
7691 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007692
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 if (code_page < 0) {
7694 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7695 return NULL;
7696 }
7697
Martin v. Löwis3d325192011-11-04 18:23:06 +01007698 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007699 return PyBytes_FromStringAndSize(NULL, 0);
7700
Victor Stinner7581cef2011-11-03 22:32:33 +01007701 offset = 0;
7702 do
7703 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007704#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007705 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007706 chunks. */
7707 if (len > INT_MAX/2) {
7708 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 done = 0;
7710 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007711 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007712#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007713 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007714 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007715 done = 1;
7716 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007717
Victor Stinner76a31a62011-11-04 00:05:13 +01007718 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007719 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007720 errors);
7721 if (ret == -2)
7722 ret = encode_code_page_errors(code_page, &outbytes,
7723 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007724 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007725 if (ret < 0) {
7726 Py_XDECREF(outbytes);
7727 return NULL;
7728 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729
Victor Stinner7581cef2011-11-03 22:32:33 +01007730 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007733
Victor Stinner3a50e702011-10-18 21:21:00 +02007734 return outbytes;
7735}
7736
7737PyObject *
7738PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7739 Py_ssize_t size,
7740 const char *errors)
7741{
Victor Stinner7581cef2011-11-03 22:32:33 +01007742 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007743 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007744 if (unicode == NULL)
7745 return NULL;
7746 res = encode_code_page(CP_ACP, unicode, errors);
7747 Py_DECREF(unicode);
7748 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007749}
7750
7751PyObject *
7752PyUnicode_EncodeCodePage(int code_page,
7753 PyObject *unicode,
7754 const char *errors)
7755{
Victor Stinner7581cef2011-11-03 22:32:33 +01007756 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007757}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007758
Alexander Belopolsky40018472011-02-26 01:02:56 +00007759PyObject *
7760PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007761{
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007763}
7764
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007765#undef NEED_RETRY
7766
Steve Dowercc16be82016-09-08 10:35:16 -07007767#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007768
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769/* --- Character Mapping Codec -------------------------------------------- */
7770
Victor Stinnerfb161b12013-04-18 01:44:27 +02007771static int
7772charmap_decode_string(const char *s,
7773 Py_ssize_t size,
7774 PyObject *mapping,
7775 const char *errors,
7776 _PyUnicodeWriter *writer)
7777{
7778 const char *starts = s;
7779 const char *e;
7780 Py_ssize_t startinpos, endinpos;
7781 PyObject *errorHandler = NULL, *exc = NULL;
7782 Py_ssize_t maplen;
7783 enum PyUnicode_Kind mapkind;
7784 void *mapdata;
7785 Py_UCS4 x;
7786 unsigned char ch;
7787
7788 if (PyUnicode_READY(mapping) == -1)
7789 return -1;
7790
7791 maplen = PyUnicode_GET_LENGTH(mapping);
7792 mapdata = PyUnicode_DATA(mapping);
7793 mapkind = PyUnicode_KIND(mapping);
7794
7795 e = s + size;
7796
7797 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7798 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7799 * is disabled in encoding aliases, latin1 is preferred because
7800 * its implementation is faster. */
7801 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7802 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7803 Py_UCS4 maxchar = writer->maxchar;
7804
7805 assert (writer->kind == PyUnicode_1BYTE_KIND);
7806 while (s < e) {
7807 ch = *s;
7808 x = mapdata_ucs1[ch];
7809 if (x > maxchar) {
7810 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7811 goto onError;
7812 maxchar = writer->maxchar;
7813 outdata = (Py_UCS1 *)writer->data;
7814 }
7815 outdata[writer->pos] = x;
7816 writer->pos++;
7817 ++s;
7818 }
7819 return 0;
7820 }
7821
7822 while (s < e) {
7823 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7824 enum PyUnicode_Kind outkind = writer->kind;
7825 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7826 if (outkind == PyUnicode_1BYTE_KIND) {
7827 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7828 Py_UCS4 maxchar = writer->maxchar;
7829 while (s < e) {
7830 ch = *s;
7831 x = mapdata_ucs2[ch];
7832 if (x > maxchar)
7833 goto Error;
7834 outdata[writer->pos] = x;
7835 writer->pos++;
7836 ++s;
7837 }
7838 break;
7839 }
7840 else if (outkind == PyUnicode_2BYTE_KIND) {
7841 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7842 while (s < e) {
7843 ch = *s;
7844 x = mapdata_ucs2[ch];
7845 if (x == 0xFFFE)
7846 goto Error;
7847 outdata[writer->pos] = x;
7848 writer->pos++;
7849 ++s;
7850 }
7851 break;
7852 }
7853 }
7854 ch = *s;
7855
7856 if (ch < maplen)
7857 x = PyUnicode_READ(mapkind, mapdata, ch);
7858 else
7859 x = 0xfffe; /* invalid value */
7860Error:
7861 if (x == 0xfffe)
7862 {
7863 /* undefined mapping */
7864 startinpos = s-starts;
7865 endinpos = startinpos+1;
7866 if (unicode_decode_call_errorhandler_writer(
7867 errors, &errorHandler,
7868 "charmap", "character maps to <undefined>",
7869 &starts, &e, &startinpos, &endinpos, &exc, &s,
7870 writer)) {
7871 goto onError;
7872 }
7873 continue;
7874 }
7875
7876 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7877 goto onError;
7878 ++s;
7879 }
7880 Py_XDECREF(errorHandler);
7881 Py_XDECREF(exc);
7882 return 0;
7883
7884onError:
7885 Py_XDECREF(errorHandler);
7886 Py_XDECREF(exc);
7887 return -1;
7888}
7889
7890static int
7891charmap_decode_mapping(const char *s,
7892 Py_ssize_t size,
7893 PyObject *mapping,
7894 const char *errors,
7895 _PyUnicodeWriter *writer)
7896{
7897 const char *starts = s;
7898 const char *e;
7899 Py_ssize_t startinpos, endinpos;
7900 PyObject *errorHandler = NULL, *exc = NULL;
7901 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007902 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007903
7904 e = s + size;
7905
7906 while (s < e) {
7907 ch = *s;
7908
7909 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7910 key = PyLong_FromLong((long)ch);
7911 if (key == NULL)
7912 goto onError;
7913
7914 item = PyObject_GetItem(mapping, key);
7915 Py_DECREF(key);
7916 if (item == NULL) {
7917 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7918 /* No mapping found means: mapping is undefined. */
7919 PyErr_Clear();
7920 goto Undefined;
7921 } else
7922 goto onError;
7923 }
7924
7925 /* Apply mapping */
7926 if (item == Py_None)
7927 goto Undefined;
7928 if (PyLong_Check(item)) {
7929 long value = PyLong_AS_LONG(item);
7930 if (value == 0xFFFE)
7931 goto Undefined;
7932 if (value < 0 || value > MAX_UNICODE) {
7933 PyErr_Format(PyExc_TypeError,
7934 "character mapping must be in range(0x%lx)",
7935 (unsigned long)MAX_UNICODE + 1);
7936 goto onError;
7937 }
7938
7939 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7940 goto onError;
7941 }
7942 else if (PyUnicode_Check(item)) {
7943 if (PyUnicode_READY(item) == -1)
7944 goto onError;
7945 if (PyUnicode_GET_LENGTH(item) == 1) {
7946 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7947 if (value == 0xFFFE)
7948 goto Undefined;
7949 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7950 goto onError;
7951 }
7952 else {
7953 writer->overallocate = 1;
7954 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7955 goto onError;
7956 }
7957 }
7958 else {
7959 /* wrong return value */
7960 PyErr_SetString(PyExc_TypeError,
7961 "character mapping must return integer, None or str");
7962 goto onError;
7963 }
7964 Py_CLEAR(item);
7965 ++s;
7966 continue;
7967
7968Undefined:
7969 /* undefined mapping */
7970 Py_CLEAR(item);
7971 startinpos = s-starts;
7972 endinpos = startinpos+1;
7973 if (unicode_decode_call_errorhandler_writer(
7974 errors, &errorHandler,
7975 "charmap", "character maps to <undefined>",
7976 &starts, &e, &startinpos, &endinpos, &exc, &s,
7977 writer)) {
7978 goto onError;
7979 }
7980 }
7981 Py_XDECREF(errorHandler);
7982 Py_XDECREF(exc);
7983 return 0;
7984
7985onError:
7986 Py_XDECREF(item);
7987 Py_XDECREF(errorHandler);
7988 Py_XDECREF(exc);
7989 return -1;
7990}
7991
Alexander Belopolsky40018472011-02-26 01:02:56 +00007992PyObject *
7993PyUnicode_DecodeCharmap(const char *s,
7994 Py_ssize_t size,
7995 PyObject *mapping,
7996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007998 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007999
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 /* Default to Latin-1 */
8001 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008005 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008006 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008007 writer.min_length = size;
8008 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008010
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008011 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008012 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8013 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008014 }
8015 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008016 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008019 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008020
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008022 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 return NULL;
8024}
8025
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026/* Charmap encoding: the lookup table */
8027
Alexander Belopolsky40018472011-02-26 01:02:56 +00008028struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 PyObject_HEAD
8030 unsigned char level1[32];
8031 int count2, count3;
8032 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033};
8034
8035static PyObject*
8036encoding_map_size(PyObject *obj, PyObject* args)
8037{
8038 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008039 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041}
8042
8043static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 PyDoc_STR("Return the size (in bytes) of this object") },
8046 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047};
8048
8049static void
8050encoding_map_dealloc(PyObject* o)
8051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008052 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053}
8054
8055static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 "EncodingMap", /*tp_name*/
8058 sizeof(struct encoding_map), /*tp_basicsize*/
8059 0, /*tp_itemsize*/
8060 /* methods */
8061 encoding_map_dealloc, /*tp_dealloc*/
8062 0, /*tp_print*/
8063 0, /*tp_getattr*/
8064 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008065 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 0, /*tp_repr*/
8067 0, /*tp_as_number*/
8068 0, /*tp_as_sequence*/
8069 0, /*tp_as_mapping*/
8070 0, /*tp_hash*/
8071 0, /*tp_call*/
8072 0, /*tp_str*/
8073 0, /*tp_getattro*/
8074 0, /*tp_setattro*/
8075 0, /*tp_as_buffer*/
8076 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8077 0, /*tp_doc*/
8078 0, /*tp_traverse*/
8079 0, /*tp_clear*/
8080 0, /*tp_richcompare*/
8081 0, /*tp_weaklistoffset*/
8082 0, /*tp_iter*/
8083 0, /*tp_iternext*/
8084 encoding_map_methods, /*tp_methods*/
8085 0, /*tp_members*/
8086 0, /*tp_getset*/
8087 0, /*tp_base*/
8088 0, /*tp_dict*/
8089 0, /*tp_descr_get*/
8090 0, /*tp_descr_set*/
8091 0, /*tp_dictoffset*/
8092 0, /*tp_init*/
8093 0, /*tp_alloc*/
8094 0, /*tp_new*/
8095 0, /*tp_free*/
8096 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097};
8098
8099PyObject*
8100PyUnicode_BuildEncodingMap(PyObject* string)
8101{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102 PyObject *result;
8103 struct encoding_map *mresult;
8104 int i;
8105 int need_dict = 0;
8106 unsigned char level1[32];
8107 unsigned char level2[512];
8108 unsigned char *mlevel1, *mlevel2, *mlevel3;
8109 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 int kind;
8111 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008112 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008115 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 PyErr_BadArgument();
8117 return NULL;
8118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 kind = PyUnicode_KIND(string);
8120 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008121 length = PyUnicode_GET_LENGTH(string);
8122 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008123 memset(level1, 0xFF, sizeof level1);
8124 memset(level2, 0xFF, sizeof level2);
8125
8126 /* If there isn't a one-to-one mapping of NULL to \0,
8127 or if there are non-BMP characters, we need to use
8128 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008129 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008130 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008131 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008133 ch = PyUnicode_READ(kind, data, i);
8134 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 need_dict = 1;
8136 break;
8137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139 /* unmapped character */
8140 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 l1 = ch >> 11;
8142 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 if (level1[l1] == 0xFF)
8144 level1[l1] = count2++;
8145 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008146 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 }
8148
8149 if (count2 >= 0xFF || count3 >= 0xFF)
8150 need_dict = 1;
8151
8152 if (need_dict) {
8153 PyObject *result = PyDict_New();
8154 PyObject *key, *value;
8155 if (!result)
8156 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008157 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008159 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 if (!key || !value)
8161 goto failed1;
8162 if (PyDict_SetItem(result, key, value) == -1)
8163 goto failed1;
8164 Py_DECREF(key);
8165 Py_DECREF(value);
8166 }
8167 return result;
8168 failed1:
8169 Py_XDECREF(key);
8170 Py_XDECREF(value);
8171 Py_DECREF(result);
8172 return NULL;
8173 }
8174
8175 /* Create a three-level trie */
8176 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8177 16*count2 + 128*count3 - 1);
8178 if (!result)
8179 return PyErr_NoMemory();
8180 PyObject_Init(result, &EncodingMapType);
8181 mresult = (struct encoding_map*)result;
8182 mresult->count2 = count2;
8183 mresult->count3 = count3;
8184 mlevel1 = mresult->level1;
8185 mlevel2 = mresult->level23;
8186 mlevel3 = mresult->level23 + 16*count2;
8187 memcpy(mlevel1, level1, 32);
8188 memset(mlevel2, 0xFF, 16*count2);
8189 memset(mlevel3, 0, 128*count3);
8190 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008191 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008192 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008193 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8194 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008195 /* unmapped character */
8196 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008197 o1 = ch>>11;
8198 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008199 i2 = 16*mlevel1[o1] + o2;
8200 if (mlevel2[i2] == 0xFF)
8201 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008202 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008203 i3 = 128*mlevel2[i2] + o3;
8204 mlevel3[i3] = i;
8205 }
8206 return result;
8207}
8208
8209static int
Victor Stinner22168992011-11-20 17:09:18 +01008210encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211{
8212 struct encoding_map *map = (struct encoding_map*)mapping;
8213 int l1 = c>>11;
8214 int l2 = (c>>7) & 0xF;
8215 int l3 = c & 0x7F;
8216 int i;
8217
Victor Stinner22168992011-11-20 17:09:18 +01008218 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220 if (c == 0)
8221 return 0;
8222 /* level 1*/
8223 i = map->level1[l1];
8224 if (i == 0xFF) {
8225 return -1;
8226 }
8227 /* level 2*/
8228 i = map->level23[16*i+l2];
8229 if (i == 0xFF) {
8230 return -1;
8231 }
8232 /* level 3 */
8233 i = map->level23[16*map->count2 + 128*i + l3];
8234 if (i == 0) {
8235 return -1;
8236 }
8237 return i;
8238}
8239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240/* Lookup the character ch in the mapping. If the character
8241 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008242 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008243static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008244charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245{
Christian Heimes217cfd12007-12-02 14:31:20 +00008246 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 PyObject *x;
8248
8249 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 x = PyObject_GetItem(mapping, w);
8252 Py_DECREF(w);
8253 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8255 /* No mapping found means: mapping is undefined. */
8256 PyErr_Clear();
8257 x = Py_None;
8258 Py_INCREF(x);
8259 return x;
8260 } else
8261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008263 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008265 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 long value = PyLong_AS_LONG(x);
8267 if (value < 0 || value > 255) {
8268 PyErr_SetString(PyExc_TypeError,
8269 "character mapping must be in range(256)");
8270 Py_DECREF(x);
8271 return NULL;
8272 }
8273 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008275 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 /* wrong return value */
8279 PyErr_Format(PyExc_TypeError,
8280 "character mapping must return integer, bytes or None, not %.400s",
8281 x->ob_type->tp_name);
8282 Py_DECREF(x);
8283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 }
8285}
8286
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008288charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008289{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008290 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8291 /* exponentially overallocate to minimize reallocations */
8292 if (requiredsize < 2*outsize)
8293 requiredsize = 2*outsize;
8294 if (_PyBytes_Resize(outobj, requiredsize))
8295 return -1;
8296 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008297}
8298
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008303 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 space is available. Return a new reference to the object that
8305 was put in the output buffer, or Py_None, if the mapping was undefined
8306 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008307 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008308static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008309charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008310 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 PyObject *rep;
8313 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008314 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008315
Christian Heimes90aa7642007-12-19 02:45:37 +00008316 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008319 if (res == -1)
8320 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 if (outsize<requiredsize)
8322 if (charmapencode_resize(outobj, outpos, requiredsize))
8323 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008324 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 outstart[(*outpos)++] = (char)res;
8326 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008327 }
8328
8329 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 Py_DECREF(rep);
8334 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 if (PyLong_Check(rep)) {
8337 Py_ssize_t requiredsize = *outpos+1;
8338 if (outsize<requiredsize)
8339 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8340 Py_DECREF(rep);
8341 return enc_EXCEPTION;
8342 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008343 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008345 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 else {
8347 const char *repchars = PyBytes_AS_STRING(rep);
8348 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8349 Py_ssize_t requiredsize = *outpos+repsize;
8350 if (outsize<requiredsize)
8351 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8352 Py_DECREF(rep);
8353 return enc_EXCEPTION;
8354 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008355 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 memcpy(outstart + *outpos, repchars, repsize);
8357 *outpos += repsize;
8358 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008360 Py_DECREF(rep);
8361 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362}
8363
8364/* handle an error in PyUnicode_EncodeCharmap
8365 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366static int
8367charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008368 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008370 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008371 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372{
8373 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008374 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008375 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008376 enum PyUnicode_Kind kind;
8377 void *data;
8378 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008380 Py_ssize_t collstartpos = *inpos;
8381 Py_ssize_t collendpos = *inpos+1;
8382 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383 char *encoding = "charmap";
8384 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008385 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008386 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008387 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388
Benjamin Petersonbac79492012-01-14 13:34:47 -05008389 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008390 return -1;
8391 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 /* find all unencodable characters */
8393 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008394 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008395 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008396 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008397 val = encoding_map_lookup(ch, mapping);
8398 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 break;
8400 ++collendpos;
8401 continue;
8402 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008403
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008404 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8405 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 if (rep==NULL)
8407 return -1;
8408 else if (rep!=Py_None) {
8409 Py_DECREF(rep);
8410 break;
8411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 }
8415 /* cache callback name lookup
8416 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008417 if (*error_handler == _Py_ERROR_UNKNOWN)
8418 *error_handler = get_error_handler(errors);
8419
8420 switch (*error_handler) {
8421 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008422 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008423 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008424
8425 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008426 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 x = charmapencode_output('?', mapping, res, respos);
8428 if (x==enc_EXCEPTION) {
8429 return -1;
8430 }
8431 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008432 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 return -1;
8434 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008435 }
8436 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008437 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 *inpos = collendpos;
8439 break;
Victor Stinner50149202015-09-22 00:26:54 +02008440
8441 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 /* generate replacement (temporarily (mis)uses p) */
8443 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 char buffer[2+29+1+1];
8445 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008446 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 for (cp = buffer; *cp; ++cp) {
8448 x = charmapencode_output(*cp, mapping, res, respos);
8449 if (x==enc_EXCEPTION)
8450 return -1;
8451 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008452 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 return -1;
8454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 }
8456 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 *inpos = collendpos;
8458 break;
Victor Stinner50149202015-09-22 00:26:54 +02008459
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 default:
Victor Stinner50149202015-09-22 00:26:54 +02008461 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008462 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008466 if (PyBytes_Check(repunicode)) {
8467 /* Directly copy bytes result to output. */
8468 Py_ssize_t outsize = PyBytes_Size(*res);
8469 Py_ssize_t requiredsize;
8470 repsize = PyBytes_Size(repunicode);
8471 requiredsize = *respos + repsize;
8472 if (requiredsize > outsize)
8473 /* Make room for all additional bytes. */
8474 if (charmapencode_resize(res, respos, requiredsize)) {
8475 Py_DECREF(repunicode);
8476 return -1;
8477 }
8478 memcpy(PyBytes_AsString(*res) + *respos,
8479 PyBytes_AsString(repunicode), repsize);
8480 *respos += repsize;
8481 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008482 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008483 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008484 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008486 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008487 Py_DECREF(repunicode);
8488 return -1;
8489 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008490 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008491 data = PyUnicode_DATA(repunicode);
8492 kind = PyUnicode_KIND(repunicode);
8493 for (index = 0; index < repsize; index++) {
8494 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8495 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008497 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 return -1;
8499 }
8500 else if (x==enc_FAILED) {
8501 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008502 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 return -1;
8504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008505 }
8506 *inpos = newpos;
8507 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508 }
8509 return 0;
8510}
8511
Alexander Belopolsky40018472011-02-26 01:02:56 +00008512PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513_PyUnicode_EncodeCharmap(PyObject *unicode,
8514 PyObject *mapping,
8515 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 /* output object */
8518 PyObject *res = NULL;
8519 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008520 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008521 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008522 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008523 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008524 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008526 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008527 void *data;
8528 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529
Benjamin Petersonbac79492012-01-14 13:34:47 -05008530 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008531 return NULL;
8532 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008533 data = PyUnicode_DATA(unicode);
8534 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008535
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 /* Default to Latin-1 */
8537 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 /* allocate enough for a simple encoding without
8541 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008542 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 if (res == NULL)
8544 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008545 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008549 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008551 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 if (x==enc_EXCEPTION) /* error */
8553 goto onError;
8554 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008557 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 &res, &respos)) {
8559 goto onError;
8560 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008561 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 else
8563 /* done with this character => adjust input position */
8564 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008568 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008569 if (_PyBytes_Resize(&res, respos) < 0)
8570 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008573 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 return res;
8575
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577 Py_XDECREF(res);
8578 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008579 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 return NULL;
8581}
8582
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008583/* Deprecated */
8584PyObject *
8585PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8586 Py_ssize_t size,
8587 PyObject *mapping,
8588 const char *errors)
8589{
8590 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008591 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008592 if (unicode == NULL)
8593 return NULL;
8594 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8595 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008596 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008597}
8598
Alexander Belopolsky40018472011-02-26 01:02:56 +00008599PyObject *
8600PyUnicode_AsCharmapString(PyObject *unicode,
8601 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602{
8603 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 PyErr_BadArgument();
8605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008607 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608}
8609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008611static void
8612make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614 Py_ssize_t startpos, Py_ssize_t endpos,
8615 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008617 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 *exceptionObject = _PyUnicodeTranslateError_Create(
8619 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 }
8621 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8623 goto onError;
8624 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8625 goto onError;
8626 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8627 goto onError;
8628 return;
8629 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008630 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 }
8632}
8633
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634/* error handling callback helper:
8635 build arguments, call the callback and check the arguments,
8636 put the result into newpos and return the replacement string, which
8637 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008638static PyObject *
8639unicode_translate_call_errorhandler(const char *errors,
8640 PyObject **errorHandler,
8641 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643 Py_ssize_t startpos, Py_ssize_t endpos,
8644 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008646 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008648 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 PyObject *restuple;
8650 PyObject *resunicode;
8651
8652 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656 }
8657
8658 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008663 restuple = PyObject_CallFunctionObjArgs(
8664 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008668 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 Py_DECREF(restuple);
8670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008672 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 &resunicode, &i_newpos)) {
8674 Py_DECREF(restuple);
8675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008677 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008679 else
8680 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008682 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 Py_DECREF(restuple);
8684 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 Py_INCREF(resunicode);
8687 Py_DECREF(restuple);
8688 return resunicode;
8689}
8690
8691/* Lookup the character ch in the mapping and put the result in result,
8692 which must be decrefed by the caller.
8693 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008694static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696{
Christian Heimes217cfd12007-12-02 14:31:20 +00008697 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 PyObject *x;
8699
8700 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 x = PyObject_GetItem(mapping, w);
8703 Py_DECREF(w);
8704 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8706 /* No mapping found means: use 1:1 mapping. */
8707 PyErr_Clear();
8708 *result = NULL;
8709 return 0;
8710 } else
8711 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 }
8713 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 *result = x;
8715 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008717 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008719 if (value < 0 || value > MAX_UNICODE) {
8720 PyErr_Format(PyExc_ValueError,
8721 "character mapping must be in range(0x%x)",
8722 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 Py_DECREF(x);
8724 return -1;
8725 }
8726 *result = x;
8727 return 0;
8728 }
8729 else if (PyUnicode_Check(x)) {
8730 *result = x;
8731 return 0;
8732 }
8733 else {
8734 /* wrong return value */
8735 PyErr_SetString(PyExc_TypeError,
8736 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008737 Py_DECREF(x);
8738 return -1;
8739 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008740}
Victor Stinner1194ea02014-04-04 19:37:40 +02008741
8742/* lookup the character, write the result into the writer.
8743 Return 1 if the result was written into the writer, return 0 if the mapping
8744 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008746charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8747 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008748{
Victor Stinner1194ea02014-04-04 19:37:40 +02008749 PyObject *item;
8750
8751 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008753
8754 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008756 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008759 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008761
8762 if (item == Py_None) {
8763 Py_DECREF(item);
8764 return 0;
8765 }
8766
8767 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008768 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8769 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8770 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008771 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8772 Py_DECREF(item);
8773 return -1;
8774 }
8775 Py_DECREF(item);
8776 return 1;
8777 }
8778
8779 if (!PyUnicode_Check(item)) {
8780 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008782 }
8783
8784 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8785 Py_DECREF(item);
8786 return -1;
8787 }
8788
8789 Py_DECREF(item);
8790 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008791}
8792
Victor Stinner89a76ab2014-04-05 11:44:04 +02008793static int
8794unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8795 Py_UCS1 *translate)
8796{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008797 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008798 int ret = 0;
8799
Victor Stinner89a76ab2014-04-05 11:44:04 +02008800 if (charmaptranslate_lookup(ch, mapping, &item)) {
8801 return -1;
8802 }
8803
8804 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008805 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008806 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008807 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008808 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008809 /* not found => default to 1:1 mapping */
8810 translate[ch] = ch;
8811 return 1;
8812 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008813 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008814 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008815 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8816 used it */
8817 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 /* invalid character or character outside ASCII:
8819 skip the fast translate */
8820 goto exit;
8821 }
8822 translate[ch] = (Py_UCS1)replace;
8823 }
8824 else if (PyUnicode_Check(item)) {
8825 Py_UCS4 replace;
8826
8827 if (PyUnicode_READY(item) == -1) {
8828 Py_DECREF(item);
8829 return -1;
8830 }
8831 if (PyUnicode_GET_LENGTH(item) != 1)
8832 goto exit;
8833
8834 replace = PyUnicode_READ_CHAR(item, 0);
8835 if (replace > 127)
8836 goto exit;
8837 translate[ch] = (Py_UCS1)replace;
8838 }
8839 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008840 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008841 goto exit;
8842 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008843 ret = 1;
8844
Benjamin Peterson1365de72014-04-07 20:15:41 -04008845 exit:
8846 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008847 return ret;
8848}
8849
8850/* Fast path for ascii => ascii translation. Return 1 if the whole string
8851 was translated into writer, return 0 if the input string was partially
8852 translated into writer, raise an exception and return -1 on error. */
8853static int
8854unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008855 _PyUnicodeWriter *writer, int ignore,
8856 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857{
Victor Stinner872b2912014-04-05 14:27:07 +02008858 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 Py_ssize_t len;
8860 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008861 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 len = PyUnicode_GET_LENGTH(input);
8864
Victor Stinner872b2912014-04-05 14:27:07 +02008865 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866
8867 in = PyUnicode_1BYTE_DATA(input);
8868 end = in + len;
8869
8870 assert(PyUnicode_IS_ASCII(writer->buffer));
8871 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8872 out = PyUnicode_1BYTE_DATA(writer->buffer);
8873
Victor Stinner872b2912014-04-05 14:27:07 +02008874 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008876 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008878 int translate = unicode_fast_translate_lookup(mapping, ch,
8879 ascii_table);
8880 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008881 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008882 if (translate == 0)
8883 goto exit;
8884 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008885 }
Victor Stinner872b2912014-04-05 14:27:07 +02008886 if (ch2 == 0xfe) {
8887 if (ignore)
8888 continue;
8889 goto exit;
8890 }
8891 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008893 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008894 }
Victor Stinner872b2912014-04-05 14:27:07 +02008895 res = 1;
8896
8897exit:
8898 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008899 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008900 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008901}
8902
Victor Stinner3222da22015-10-01 22:07:32 +02008903static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904_PyUnicode_TranslateCharmap(PyObject *input,
8905 PyObject *mapping,
8906 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008909 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 Py_ssize_t size, i;
8911 int kind;
8912 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008913 _PyUnicodeWriter writer;
8914 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008915 char *reason = "character maps to <undefined>";
8916 PyObject *errorHandler = NULL;
8917 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008918 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008919 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008920
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 PyErr_BadArgument();
8923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 if (PyUnicode_READY(input) == -1)
8927 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008928 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 kind = PyUnicode_KIND(input);
8930 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008932 if (size == 0)
8933 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008935 /* allocate enough for a simple 1:1 translation without
8936 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008937 _PyUnicodeWriter_Init(&writer);
8938 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940
Victor Stinner872b2912014-04-05 14:27:07 +02008941 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8942
Victor Stinner33798672016-03-01 21:59:58 +01008943 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008944 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008945 if (PyUnicode_IS_ASCII(input)) {
8946 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8947 if (res < 0) {
8948 _PyUnicodeWriter_Dealloc(&writer);
8949 return NULL;
8950 }
8951 if (res == 1)
8952 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008953 }
Victor Stinner33798672016-03-01 21:59:58 +01008954 else {
8955 i = 0;
8956 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008960 int translate;
8961 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8962 Py_ssize_t newpos;
8963 /* startpos for collecting untranslatable chars */
8964 Py_ssize_t collstart;
8965 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008966 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967
Victor Stinner1194ea02014-04-04 19:37:40 +02008968 ch = PyUnicode_READ(kind, data, i);
8969 translate = charmaptranslate_output(ch, mapping, &writer);
8970 if (translate < 0)
8971 goto onError;
8972
8973 if (translate != 0) {
8974 /* it worked => adjust input pointer */
8975 ++i;
8976 continue;
8977 }
8978
8979 /* untranslatable character */
8980 collstart = i;
8981 collend = i+1;
8982
8983 /* find all untranslatable characters */
8984 while (collend < size) {
8985 PyObject *x;
8986 ch = PyUnicode_READ(kind, data, collend);
8987 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008988 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008989 Py_XDECREF(x);
8990 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008992 ++collend;
8993 }
8994
8995 if (ignore) {
8996 i = collend;
8997 }
8998 else {
8999 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9000 reason, input, &exc,
9001 collstart, collend, &newpos);
9002 if (repunicode == NULL)
9003 goto onError;
9004 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009006 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009007 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009008 Py_DECREF(repunicode);
9009 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009010 }
9011 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009012 Py_XDECREF(exc);
9013 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009014 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009017 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009018 Py_XDECREF(exc);
9019 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 return NULL;
9021}
9022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023/* Deprecated. Use PyUnicode_Translate instead. */
9024PyObject *
9025PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9026 Py_ssize_t size,
9027 PyObject *mapping,
9028 const char *errors)
9029{
Christian Heimes5f520f42012-09-11 14:03:25 +02009030 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009031 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 if (!unicode)
9033 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009034 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9035 Py_DECREF(unicode);
9036 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009037}
9038
Alexander Belopolsky40018472011-02-26 01:02:56 +00009039PyObject *
9040PyUnicode_Translate(PyObject *str,
9041 PyObject *mapping,
9042 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009044 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009045 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009046 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047}
Tim Petersced69f82003-09-16 20:30:58 +00009048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009050fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051{
9052 /* No need to call PyUnicode_READY(self) because this function is only
9053 called as a callback from fixup() which does it already. */
9054 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9055 const int kind = PyUnicode_KIND(self);
9056 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009057 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009058 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 Py_ssize_t i;
9060
9061 for (i = 0; i < len; ++i) {
9062 ch = PyUnicode_READ(kind, data, i);
9063 fixed = 0;
9064 if (ch > 127) {
9065 if (Py_UNICODE_ISSPACE(ch))
9066 fixed = ' ';
9067 else {
9068 const int decimal = Py_UNICODE_TODECIMAL(ch);
9069 if (decimal >= 0)
9070 fixed = '0' + decimal;
9071 }
9072 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009073 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009074 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 PyUnicode_WRITE(kind, data, i, fixed);
9076 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009077 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009078 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 }
9081
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009082 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083}
9084
9085PyObject *
9086_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9087{
9088 if (!PyUnicode_Check(unicode)) {
9089 PyErr_BadInternalCall();
9090 return NULL;
9091 }
9092 if (PyUnicode_READY(unicode) == -1)
9093 return NULL;
9094 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9095 /* If the string is already ASCII, just return the same string */
9096 Py_INCREF(unicode);
9097 return unicode;
9098 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009099 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100}
9101
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009102PyObject *
9103PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9104 Py_ssize_t length)
9105{
Victor Stinnerf0124502011-11-21 23:12:56 +01009106 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009107 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009108 Py_UCS4 maxchar;
9109 enum PyUnicode_Kind kind;
9110 void *data;
9111
Victor Stinner99d7ad02012-02-22 13:37:39 +01009112 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009113 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009114 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009115 if (ch > 127) {
9116 int decimal = Py_UNICODE_TODECIMAL(ch);
9117 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009118 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009119 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009120 }
9121 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009122
9123 /* Copy to a new string */
9124 decimal = PyUnicode_New(length, maxchar);
9125 if (decimal == NULL)
9126 return decimal;
9127 kind = PyUnicode_KIND(decimal);
9128 data = PyUnicode_DATA(decimal);
9129 /* Iterate over code points */
9130 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009131 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009132 if (ch > 127) {
9133 int decimal = Py_UNICODE_TODECIMAL(ch);
9134 if (decimal >= 0)
9135 ch = '0' + decimal;
9136 }
9137 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009139 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009140}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009141/* --- Decimal Encoder ---------------------------------------------------- */
9142
Alexander Belopolsky40018472011-02-26 01:02:56 +00009143int
9144PyUnicode_EncodeDecimal(Py_UNICODE *s,
9145 Py_ssize_t length,
9146 char *output,
9147 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009148{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009149 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009150 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009151 enum PyUnicode_Kind kind;
9152 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009153
9154 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 PyErr_BadArgument();
9156 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009157 }
9158
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009159 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009160 if (unicode == NULL)
9161 return -1;
9162
Victor Stinner42bf7752011-11-21 22:52:58 +01009163 kind = PyUnicode_KIND(unicode);
9164 data = PyUnicode_DATA(unicode);
9165
Victor Stinnerb84d7232011-11-22 01:50:07 +01009166 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009167 PyObject *exc;
9168 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009170 Py_ssize_t startpos;
9171
9172 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009173
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009175 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009176 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009178 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 decimal = Py_UNICODE_TODECIMAL(ch);
9180 if (decimal >= 0) {
9181 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009182 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009183 continue;
9184 }
9185 if (0 < ch && ch < 256) {
9186 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009187 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 continue;
9189 }
Victor Stinner6345be92011-11-25 20:09:01 +01009190
Victor Stinner42bf7752011-11-21 22:52:58 +01009191 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009192 exc = NULL;
9193 raise_encode_exception(&exc, "decimal", unicode,
9194 startpos, startpos+1,
9195 "invalid decimal Unicode string");
9196 Py_XDECREF(exc);
9197 Py_DECREF(unicode);
9198 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009199 }
9200 /* 0-terminate the output string */
9201 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009202 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009203 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009204}
9205
Guido van Rossumd57fd912000-03-10 22:53:23 +00009206/* --- Helpers ------------------------------------------------------------ */
9207
/* Helper macro to fix up start/end slice values for a sequence of length
   `len`, modifying `start` and `end` in place (both must be lvalues):

     - `end` greater than `len` is clamped to `len`;
     - negative `end` / `start` are taken relative to `len`, and clamped to 0
       if still negative.

   `start` is not clamped from above. The arguments are evaluated more than
   once, so they must be free of side effects.

   The body is wrapped in do { ... } while (0) so that the macro behaves as
   a single statement (e.g. under an unbraced `if`); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009224any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009226 Py_ssize_t end,
9227 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009229 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 void *buf1, *buf2;
9231 Py_ssize_t len1, len2, result;
9232
9233 kind1 = PyUnicode_KIND(s1);
9234 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009235 if (kind1 < kind2)
9236 return -1;
9237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 len1 = PyUnicode_GET_LENGTH(s1);
9239 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009240 ADJUST_INDICES(start, end, len1);
9241 if (end - start < len2)
9242 return -1;
9243
9244 buf1 = PyUnicode_DATA(s1);
9245 buf2 = PyUnicode_DATA(s2);
9246 if (len2 == 1) {
9247 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9248 result = findchar((const char *)buf1 + kind1*start,
9249 kind1, end - start, ch, direction);
9250 if (result == -1)
9251 return -1;
9252 else
9253 return start + result;
9254 }
9255
9256 if (kind2 != kind1) {
9257 buf2 = _PyUnicode_AsKind(s2, kind1);
9258 if (!buf2)
9259 return -2;
9260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261
Victor Stinner794d5672011-10-10 03:21:36 +02009262 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009263 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009264 case PyUnicode_1BYTE_KIND:
9265 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9266 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9267 else
9268 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9269 break;
9270 case PyUnicode_2BYTE_KIND:
9271 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9272 break;
9273 case PyUnicode_4BYTE_KIND:
9274 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9275 break;
9276 default:
9277 assert(0); result = -2;
9278 }
9279 }
9280 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009281 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009282 case PyUnicode_1BYTE_KIND:
9283 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9284 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9285 else
9286 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9287 break;
9288 case PyUnicode_2BYTE_KIND:
9289 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9290 break;
9291 case PyUnicode_4BYTE_KIND:
9292 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9293 break;
9294 default:
9295 assert(0); result = -2;
9296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 }
9298
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009299 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 PyMem_Free(buf2);
9301
9302 return result;
9303}
9304
/* Dispatch to the kind-specific *_InsertThousandsGrouping() routine.

   unicode        target string, or NULL; with NULL the routine is called
                  with a NULL buffer and *maxchar is filled in instead
   index          offset (in characters) into `unicode` where the buffer
                  handed to the routine starts
   n_buffer, digits, n_digits, min_width, grouping
                  passed through unchanged to the kind-specific routine
   thousands_sep  separator string; converted to the kind of `unicode`
                  when it is narrower
   maxchar        only written when `unicode` is NULL

   Returns the value produced by the kind-specific routine, or -1 if a kind
   conversion fails. */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* kind doubles as the character width in bytes here */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        /* no target: the routine is invoked with a NULL buffer */
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            /* widen the separator to the kind of the target; the copy is
               released after the switch below */
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): here `data` is replaced by a converted copy of
               the whole of `unicode` (the `index` offset computed above is
               not re-applied), the switch below still dispatches on the
               original `kind`, and the copy is freed afterwards, so
               nothing written through `data` appears to reach `unicode`.
               Looks suspicious -- confirm that callers never pass a
               separator wider than the target. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        /* NOTE(review): this early return skips the PyMem_Free() calls
           below; only matters if an unknown kind can ever get here. */
        assert(0);
        return -1;
    }
    /* release whichever temporary copy was created above */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        /* Without a target, report the maxchar of the would-be output:
           127, raised to the separator's maxchar when the returned length
           differs from n_digits. */
        *maxchar = 127;
        if (len != n_digits) {
            *maxchar = Py_MAX(*maxchar,
                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}
9387
9388
Alexander Belopolsky40018472011-02-26 01:02:56 +00009389Py_ssize_t
9390PyUnicode_Count(PyObject *str,
9391 PyObject *substr,
9392 Py_ssize_t start,
9393 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009395 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009396 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 void *buf1 = NULL, *buf2 = NULL;
9398 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009399
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009400 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009401 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009402
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009403 kind1 = PyUnicode_KIND(str);
9404 kind2 = PyUnicode_KIND(substr);
9405 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009406 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009407
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009408 len1 = PyUnicode_GET_LENGTH(str);
9409 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009411 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009412 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009413
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009414 buf1 = PyUnicode_DATA(str);
9415 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009416 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009417 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009418 if (!buf2)
9419 goto onError;
9420 }
9421
9422 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009424 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009425 result = asciilib_count(
9426 ((Py_UCS1*)buf1) + start, end - start,
9427 buf2, len2, PY_SSIZE_T_MAX
9428 );
9429 else
9430 result = ucs1lib_count(
9431 ((Py_UCS1*)buf1) + start, end - start,
9432 buf2, len2, PY_SSIZE_T_MAX
9433 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 break;
9435 case PyUnicode_2BYTE_KIND:
9436 result = ucs2lib_count(
9437 ((Py_UCS2*)buf1) + start, end - start,
9438 buf2, len2, PY_SSIZE_T_MAX
9439 );
9440 break;
9441 case PyUnicode_4BYTE_KIND:
9442 result = ucs4lib_count(
9443 ((Py_UCS4*)buf1) + start, end - start,
9444 buf2, len2, PY_SSIZE_T_MAX
9445 );
9446 break;
9447 default:
9448 assert(0); result = 0;
9449 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009450
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009451 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 PyMem_Free(buf2);
9453
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009456 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 PyMem_Free(buf2);
9458 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459}
9460
Alexander Belopolsky40018472011-02-26 01:02:56 +00009461Py_ssize_t
9462PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009463 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009464 Py_ssize_t start,
9465 Py_ssize_t end,
9466 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009468 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009469 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009470
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009471 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472}
9473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474Py_ssize_t
9475PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9476 Py_ssize_t start, Py_ssize_t end,
9477 int direction)
9478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009480 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 if (PyUnicode_READY(str) == -1)
9482 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009483 len = PyUnicode_GET_LENGTH(str);
9484 ADJUST_INDICES(start, end, len);
9485 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009486 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009488 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9489 kind, end-start, ch, direction);
9490 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009492 else
9493 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494}
9495
Alexander Belopolsky40018472011-02-26 01:02:56 +00009496static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009497tailmatch(PyObject *self,
9498 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009499 Py_ssize_t start,
9500 Py_ssize_t end,
9501 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 int kind_self;
9504 int kind_sub;
9505 void *data_self;
9506 void *data_sub;
9507 Py_ssize_t offset;
9508 Py_ssize_t i;
9509 Py_ssize_t end_sub;
9510
9511 if (PyUnicode_READY(self) == -1 ||
9512 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009513 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9516 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009520 if (PyUnicode_GET_LENGTH(substring) == 0)
9521 return 1;
9522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 kind_self = PyUnicode_KIND(self);
9524 data_self = PyUnicode_DATA(self);
9525 kind_sub = PyUnicode_KIND(substring);
9526 data_sub = PyUnicode_DATA(substring);
9527 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9528
9529 if (direction > 0)
9530 offset = end;
9531 else
9532 offset = start;
9533
9534 if (PyUnicode_READ(kind_self, data_self, offset) ==
9535 PyUnicode_READ(kind_sub, data_sub, 0) &&
9536 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9537 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9538 /* If both are of the same kind, memcmp is sufficient */
9539 if (kind_self == kind_sub) {
9540 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009541 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 data_sub,
9543 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009544 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009546 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 else {
9548 /* We do not need to compare 0 and len(substring)-1 because
9549 the if statement above ensured already that they are equal
9550 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 for (i = 1; i < end_sub; ++i) {
9552 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9553 PyUnicode_READ(kind_sub, data_sub, i))
9554 return 0;
9555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558 }
9559
9560 return 0;
9561}
9562
Alexander Belopolsky40018472011-02-26 01:02:56 +00009563Py_ssize_t
9564PyUnicode_Tailmatch(PyObject *str,
9565 PyObject *substr,
9566 Py_ssize_t start,
9567 Py_ssize_t end,
9568 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009570 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009572
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009573 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574}
9575
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576/* Apply fixfct filter to the Unicode object self and return a
9577 reference to the modified object */
9578
Alexander Belopolsky40018472011-02-26 01:02:56 +00009579static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009580fixup(PyObject *self,
9581 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 PyObject *u;
9584 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009585 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009587 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009590 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 /* fix functions return the new maximum character in a string,
9593 if the kind of the resulting unicode object does not change,
9594 everything is fine. Otherwise we need to change the string kind
9595 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009596 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009597
9598 if (maxchar_new == 0) {
9599 /* no changes */;
9600 if (PyUnicode_CheckExact(self)) {
9601 Py_DECREF(u);
9602 Py_INCREF(self);
9603 return self;
9604 }
9605 else
9606 return u;
9607 }
9608
Victor Stinnere6abb482012-05-02 01:15:40 +02009609 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610
Victor Stinnereaab6042011-12-11 22:22:39 +01009611 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009613
9614 /* In case the maximum character changed, we need to
9615 convert the string to the new category. */
9616 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9617 if (v == NULL) {
9618 Py_DECREF(u);
9619 return NULL;
9620 }
9621 if (maxchar_new > maxchar_old) {
9622 /* If the maxchar increased so that the kind changed, not all
9623 characters are representable anymore and we need to fix the
9624 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009625 _PyUnicode_FastCopyCharacters(v, 0,
9626 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009627 maxchar_old = fixfct(v);
9628 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 }
9630 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009631 _PyUnicode_FastCopyCharacters(v, 0,
9632 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009634 Py_DECREF(u);
9635 assert(_PyUnicode_CheckConsistency(v, 1));
9636 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637}
9638
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639static PyObject *
9640ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9643 char *resdata, *data = PyUnicode_DATA(self);
9644 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009645
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009646 res = PyUnicode_New(len, 127);
9647 if (res == NULL)
9648 return NULL;
9649 resdata = PyUnicode_DATA(res);
9650 if (lower)
9651 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653 _Py_bytes_upper(resdata, data, len);
9654 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655}
9656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660 Py_ssize_t j;
9661 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009662 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009664
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9666
9667 where ! is a negation and \p{xxx} is a character with property xxx.
9668 */
9669 for (j = i - 1; j >= 0; j--) {
9670 c = PyUnicode_READ(kind, data, j);
9671 if (!_PyUnicode_IsCaseIgnorable(c))
9672 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9675 if (final_sigma) {
9676 for (j = i + 1; j < length; j++) {
9677 c = PyUnicode_READ(kind, data, j);
9678 if (!_PyUnicode_IsCaseIgnorable(c))
9679 break;
9680 }
9681 final_sigma = j == length || !_PyUnicode_IsCased(c);
9682 }
9683 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684}
9685
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686static int
9687lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9688 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 /* Obscure special case. */
9691 if (c == 0x3A3) {
9692 mapped[0] = handle_capital_sigma(kind, data, length, i);
9693 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696}
9697
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698static Py_ssize_t
9699do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701 Py_ssize_t i, k = 0;
9702 int n_res, j;
9703 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009704
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 c = PyUnicode_READ(kind, data, 0);
9706 n_res = _PyUnicode_ToUpperFull(c, mapped);
9707 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009708 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 for (i = 1; i < length; i++) {
9712 c = PyUnicode_READ(kind, data, i);
9713 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9714 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009715 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009717 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009718 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720}
9721
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009722static Py_ssize_t
9723do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9724 Py_ssize_t i, k = 0;
9725
9726 for (i = 0; i < length; i++) {
9727 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9728 int n_res, j;
9729 if (Py_UNICODE_ISUPPER(c)) {
9730 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9731 }
9732 else if (Py_UNICODE_ISLOWER(c)) {
9733 n_res = _PyUnicode_ToUpperFull(c, mapped);
9734 }
9735 else {
9736 n_res = 1;
9737 mapped[0] = c;
9738 }
9739 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009740 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 res[k++] = mapped[j];
9742 }
9743 }
9744 return k;
9745}
9746
9747static Py_ssize_t
9748do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9749 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009751 Py_ssize_t i, k = 0;
9752
9753 for (i = 0; i < length; i++) {
9754 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9755 int n_res, j;
9756 if (lower)
9757 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9758 else
9759 n_res = _PyUnicode_ToUpperFull(c, mapped);
9760 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009761 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762 res[k++] = mapped[j];
9763 }
9764 }
9765 return k;
9766}
9767
9768static Py_ssize_t
9769do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9770{
9771 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9772}
9773
9774static Py_ssize_t
9775do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9776{
9777 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9778}
9779
Benjamin Petersone51757f2012-01-12 21:10:29 -05009780static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009781do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9782{
9783 Py_ssize_t i, k = 0;
9784
9785 for (i = 0; i < length; i++) {
9786 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9787 Py_UCS4 mapped[3];
9788 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9789 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009790 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009791 res[k++] = mapped[j];
9792 }
9793 }
9794 return k;
9795}
9796
9797static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009798do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9799{
9800 Py_ssize_t i, k = 0;
9801 int previous_is_cased;
9802
9803 previous_is_cased = 0;
9804 for (i = 0; i < length; i++) {
9805 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9806 Py_UCS4 mapped[3];
9807 int n_res, j;
9808
9809 if (previous_is_cased)
9810 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9811 else
9812 n_res = _PyUnicode_ToTitleFull(c, mapped);
9813
9814 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009815 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009816 res[k++] = mapped[j];
9817 }
9818
9819 previous_is_cased = _PyUnicode_IsCased(c);
9820 }
9821 return k;
9822}
9823
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009824static PyObject *
9825case_operation(PyObject *self,
9826 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9827{
9828 PyObject *res = NULL;
9829 Py_ssize_t length, newlength = 0;
9830 int kind, outkind;
9831 void *data, *outdata;
9832 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9833
Benjamin Petersoneea48462012-01-16 14:28:50 -05009834 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009835
9836 kind = PyUnicode_KIND(self);
9837 data = PyUnicode_DATA(self);
9838 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009839 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009840 PyErr_SetString(PyExc_OverflowError, "string is too long");
9841 return NULL;
9842 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009843 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009844 if (tmp == NULL)
9845 return PyErr_NoMemory();
9846 newlength = perform(kind, data, length, tmp, &maxchar);
9847 res = PyUnicode_New(newlength, maxchar);
9848 if (res == NULL)
9849 goto leave;
9850 tmpend = tmp + newlength;
9851 outdata = PyUnicode_DATA(res);
9852 outkind = PyUnicode_KIND(res);
9853 switch (outkind) {
9854 case PyUnicode_1BYTE_KIND:
9855 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9856 break;
9857 case PyUnicode_2BYTE_KIND:
9858 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9859 break;
9860 case PyUnicode_4BYTE_KIND:
9861 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9862 break;
9863 default:
9864 assert(0);
9865 break;
9866 }
9867 leave:
9868 PyMem_FREE(tmp);
9869 return res;
9870}
9871
Tim Peters8ce9f162004-08-27 01:49:32 +00009872PyObject *
9873PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009875 PyObject *res;
9876 PyObject *fseq;
9877 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009878 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009880 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009881 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009882 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009883 }
9884
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009885 /* NOTE: the following code can't call back into Python code,
9886 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009887 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009888
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009889 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009890 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009891 res = _PyUnicode_JoinArray(separator, items, seqlen);
9892 Py_DECREF(fseq);
9893 return res;
9894}
9895
9896PyObject *
9897_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9898{
9899 PyObject *res = NULL; /* the result */
9900 PyObject *sep = NULL;
9901 Py_ssize_t seplen;
9902 PyObject *item;
9903 Py_ssize_t sz, i, res_offset;
9904 Py_UCS4 maxchar;
9905 Py_UCS4 item_maxchar;
9906 int use_memcpy;
9907 unsigned char *res_data = NULL, *sep_data = NULL;
9908 PyObject *last_obj;
9909 unsigned int kind = 0;
9910
Tim Peters05eba1f2004-08-27 21:32:02 +00009911 /* If empty sequence, return u"". */
9912 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009913 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009914 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009915
Tim Peters05eba1f2004-08-27 21:32:02 +00009916 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009917 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009918 if (seqlen == 1) {
9919 if (PyUnicode_CheckExact(items[0])) {
9920 res = items[0];
9921 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009922 return res;
9923 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009924 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009925 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009926 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009927 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009928 /* Set up sep and seplen */
9929 if (separator == NULL) {
9930 /* fall back to a blank space separator */
9931 sep = PyUnicode_FromOrdinal(' ');
9932 if (!sep)
9933 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009934 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009935 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009936 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009937 else {
9938 if (!PyUnicode_Check(separator)) {
9939 PyErr_Format(PyExc_TypeError,
9940 "separator: expected str instance,"
9941 " %.80s found",
9942 Py_TYPE(separator)->tp_name);
9943 goto onError;
9944 }
9945 if (PyUnicode_READY(separator))
9946 goto onError;
9947 sep = separator;
9948 seplen = PyUnicode_GET_LENGTH(separator);
9949 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9950 /* inc refcount to keep this code path symmetric with the
9951 above case of a blank separator */
9952 Py_INCREF(sep);
9953 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009954 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009955 }
9956
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009957 /* There are at least two things to join, or else we have a subclass
9958 * of str in the sequence.
9959 * Do a pre-pass to figure out the total amount of space we'll
9960 * need (sz), and see whether all argument are strings.
9961 */
9962 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009963#ifdef Py_DEBUG
9964 use_memcpy = 0;
9965#else
9966 use_memcpy = 1;
9967#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009969 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009970 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009971 if (!PyUnicode_Check(item)) {
9972 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009973 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009974 " %.80s found",
9975 i, Py_TYPE(item)->tp_name);
9976 goto onError;
9977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 if (PyUnicode_READY(item) == -1)
9979 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009980 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009982 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009983 if (i != 0) {
9984 add_sz += seplen;
9985 }
9986 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009987 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009989 goto onError;
9990 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009991 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009992 if (use_memcpy && last_obj != NULL) {
9993 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9994 use_memcpy = 0;
9995 }
9996 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009997 }
Tim Petersced69f82003-09-16 20:30:58 +00009998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010000 if (res == NULL)
10001 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010002
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010004#ifdef Py_DEBUG
10005 use_memcpy = 0;
10006#else
10007 if (use_memcpy) {
10008 res_data = PyUnicode_1BYTE_DATA(res);
10009 kind = PyUnicode_KIND(res);
10010 if (seplen != 0)
10011 sep_data = PyUnicode_1BYTE_DATA(sep);
10012 }
10013#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010014 if (use_memcpy) {
10015 for (i = 0; i < seqlen; ++i) {
10016 Py_ssize_t itemlen;
10017 item = items[i];
10018
10019 /* Copy item, and maybe the separator. */
10020 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010021 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010022 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010023 kind * seplen);
10024 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010025 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010026
10027 itemlen = PyUnicode_GET_LENGTH(item);
10028 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010029 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010030 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010031 kind * itemlen);
10032 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010033 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010034 }
10035 assert(res_data == PyUnicode_1BYTE_DATA(res)
10036 + kind * PyUnicode_GET_LENGTH(res));
10037 }
10038 else {
10039 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10040 Py_ssize_t itemlen;
10041 item = items[i];
10042
10043 /* Copy item, and maybe the separator. */
10044 if (i && seplen != 0) {
10045 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10046 res_offset += seplen;
10047 }
10048
10049 itemlen = PyUnicode_GET_LENGTH(item);
10050 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010051 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010052 res_offset += itemlen;
10053 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010054 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010055 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010056 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010059 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061
Benjamin Peterson29060642009-01-31 22:14:21 +000010062 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010064 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065 return NULL;
10066}
10067
/* Store `value` into `length` consecutive characters of the buffer `data`,
   starting at character index `start`.  `kind` selects the character width
   (1, 2 or 4 byte kind); the wchar kind is rejected by an assertion.

   Every argument is parenthesized in the expansion, so compound expressions
   are safe.  `kind` is expanded twice, and `value` and `length` are
   re-evaluated on each loop iteration in the 2- and 4-byte cases, so
   arguments must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10091
Victor Stinnerd3f08822012-05-29 12:57:52 +020010092void
10093_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10094 Py_UCS4 fill_char)
10095{
10096 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10097 const void *data = PyUnicode_DATA(unicode);
10098 assert(PyUnicode_IS_READY(unicode));
10099 assert(unicode_modifiable(unicode));
10100 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10101 assert(start >= 0);
10102 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10103 FILL(kind, data, fill_char, start, length);
10104}
10105
Victor Stinner3fe55312012-01-04 00:33:50 +010010106Py_ssize_t
10107PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10108 Py_UCS4 fill_char)
10109{
10110 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010111
10112 if (!PyUnicode_Check(unicode)) {
10113 PyErr_BadInternalCall();
10114 return -1;
10115 }
10116 if (PyUnicode_READY(unicode) == -1)
10117 return -1;
10118 if (unicode_check_modifiable(unicode))
10119 return -1;
10120
Victor Stinnerd3f08822012-05-29 12:57:52 +020010121 if (start < 0) {
10122 PyErr_SetString(PyExc_IndexError, "string index out of range");
10123 return -1;
10124 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010125 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10126 PyErr_SetString(PyExc_ValueError,
10127 "fill character is bigger than "
10128 "the string maximum character");
10129 return -1;
10130 }
10131
10132 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10133 length = Py_MIN(maxlen, length);
10134 if (length <= 0)
10135 return 0;
10136
Victor Stinnerd3f08822012-05-29 12:57:52 +020010137 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010138 return length;
10139}
10140
Victor Stinner9310abb2011-10-05 00:59:23 +020010141static PyObject *
10142pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010143 Py_ssize_t left,
10144 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 PyObject *u;
10148 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010149 int kind;
10150 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151
10152 if (left < 0)
10153 left = 0;
10154 if (right < 0)
10155 right = 0;
10156
Victor Stinnerc4b49542011-12-11 22:44:26 +010010157 if (left == 0 && right == 0)
10158 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10161 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010162 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10163 return NULL;
10164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010166 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010168 if (!u)
10169 return NULL;
10170
10171 kind = PyUnicode_KIND(u);
10172 data = PyUnicode_DATA(u);
10173 if (left)
10174 FILL(kind, data, fill, 0, left);
10175 if (right)
10176 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010177 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010178 assert(_PyUnicode_CheckConsistency(u, 1));
10179 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180}
10181
Alexander Belopolsky40018472011-02-26 01:02:56 +000010182PyObject *
10183PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010187 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189
Benjamin Petersonead6b532011-12-20 17:23:42 -060010190 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 if (PyUnicode_IS_ASCII(string))
10193 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195 PyUnicode_GET_LENGTH(string), keepends);
10196 else
10197 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010198 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 break;
10201 case PyUnicode_2BYTE_KIND:
10202 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010203 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 PyUnicode_GET_LENGTH(string), keepends);
10205 break;
10206 case PyUnicode_4BYTE_KIND:
10207 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010208 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 PyUnicode_GET_LENGTH(string), keepends);
10210 break;
10211 default:
10212 assert(0);
10213 list = 0;
10214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216}
10217
Alexander Belopolsky40018472011-02-26 01:02:56 +000010218static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010219split(PyObject *self,
10220 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010221 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010223 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 void *buf1, *buf2;
10225 Py_ssize_t len1, len2;
10226 PyObject* out;
10227
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010229 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 if (PyUnicode_READY(self) == -1)
10232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010235 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010237 if (PyUnicode_IS_ASCII(self))
10238 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010239 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010240 PyUnicode_GET_LENGTH(self), maxcount
10241 );
10242 else
10243 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010244 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010245 PyUnicode_GET_LENGTH(self), maxcount
10246 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 case PyUnicode_2BYTE_KIND:
10248 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010249 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 PyUnicode_GET_LENGTH(self), maxcount
10251 );
10252 case PyUnicode_4BYTE_KIND:
10253 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010254 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 PyUnicode_GET_LENGTH(self), maxcount
10256 );
10257 default:
10258 assert(0);
10259 return NULL;
10260 }
10261
10262 if (PyUnicode_READY(substring) == -1)
10263 return NULL;
10264
10265 kind1 = PyUnicode_KIND(self);
10266 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 len1 = PyUnicode_GET_LENGTH(self);
10268 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010269 if (kind1 < kind2 || len1 < len2) {
10270 out = PyList_New(1);
10271 if (out == NULL)
10272 return NULL;
10273 Py_INCREF(self);
10274 PyList_SET_ITEM(out, 0, self);
10275 return out;
10276 }
10277 buf1 = PyUnicode_DATA(self);
10278 buf2 = PyUnicode_DATA(substring);
10279 if (kind2 != kind1) {
10280 buf2 = _PyUnicode_AsKind(substring, kind1);
10281 if (!buf2)
10282 return NULL;
10283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010285 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010287 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10288 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010289 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010290 else
10291 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010292 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 break;
10294 case PyUnicode_2BYTE_KIND:
10295 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010296 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 break;
10298 case PyUnicode_4BYTE_KIND:
10299 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010300 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 break;
10302 default:
10303 out = NULL;
10304 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010305 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 PyMem_Free(buf2);
10307 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308}
10309
Alexander Belopolsky40018472011-02-26 01:02:56 +000010310static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010311rsplit(PyObject *self,
10312 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010313 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010314{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010315 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 void *buf1, *buf2;
10317 Py_ssize_t len1, len2;
10318 PyObject* out;
10319
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010320 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010321 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (PyUnicode_READY(self) == -1)
10324 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010327 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010329 if (PyUnicode_IS_ASCII(self))
10330 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 PyUnicode_GET_LENGTH(self), maxcount
10333 );
10334 else
10335 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010337 PyUnicode_GET_LENGTH(self), maxcount
10338 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 case PyUnicode_2BYTE_KIND:
10340 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 PyUnicode_GET_LENGTH(self), maxcount
10343 );
10344 case PyUnicode_4BYTE_KIND:
10345 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010346 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 PyUnicode_GET_LENGTH(self), maxcount
10348 );
10349 default:
10350 assert(0);
10351 return NULL;
10352 }
10353
10354 if (PyUnicode_READY(substring) == -1)
10355 return NULL;
10356
10357 kind1 = PyUnicode_KIND(self);
10358 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 len1 = PyUnicode_GET_LENGTH(self);
10360 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010361 if (kind1 < kind2 || len1 < len2) {
10362 out = PyList_New(1);
10363 if (out == NULL)
10364 return NULL;
10365 Py_INCREF(self);
10366 PyList_SET_ITEM(out, 0, self);
10367 return out;
10368 }
10369 buf1 = PyUnicode_DATA(self);
10370 buf2 = PyUnicode_DATA(substring);
10371 if (kind2 != kind1) {
10372 buf2 = _PyUnicode_AsKind(substring, kind1);
10373 if (!buf2)
10374 return NULL;
10375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010377 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010379 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10380 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010381 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010382 else
10383 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010384 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 break;
10386 case PyUnicode_2BYTE_KIND:
10387 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010388 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 break;
10390 case PyUnicode_4BYTE_KIND:
10391 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010392 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 break;
10394 default:
10395 out = NULL;
10396 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010397 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 PyMem_Free(buf2);
10399 return out;
10400}
10401
10402static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010403anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10404 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010406 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010408 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10409 return asciilib_find(buf1, len1, buf2, len2, offset);
10410 else
10411 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 case PyUnicode_2BYTE_KIND:
10413 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10414 case PyUnicode_4BYTE_KIND:
10415 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10416 }
10417 assert(0);
10418 return -1;
10419}
10420
10421static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10423 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010425 switch (kind) {
10426 case PyUnicode_1BYTE_KIND:
10427 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10428 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10429 else
10430 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10431 case PyUnicode_2BYTE_KIND:
10432 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10433 case PyUnicode_4BYTE_KIND:
10434 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10435 }
10436 assert(0);
10437 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010438}
10439
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010440static void
10441replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10442 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10443{
10444 int kind = PyUnicode_KIND(u);
10445 void *data = PyUnicode_DATA(u);
10446 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10447 if (kind == PyUnicode_1BYTE_KIND) {
10448 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10449 (Py_UCS1 *)data + len,
10450 u1, u2, maxcount);
10451 }
10452 else if (kind == PyUnicode_2BYTE_KIND) {
10453 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10454 (Py_UCS2 *)data + len,
10455 u1, u2, maxcount);
10456 }
10457 else {
10458 assert(kind == PyUnicode_4BYTE_KIND);
10459 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10460 (Py_UCS4 *)data + len,
10461 u1, u2, maxcount);
10462 }
10463}
10464
Alexander Belopolsky40018472011-02-26 01:02:56 +000010465static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466replace(PyObject *self, PyObject *str1,
10467 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 PyObject *u;
10470 char *sbuf = PyUnicode_DATA(self);
10471 char *buf1 = PyUnicode_DATA(str1);
10472 char *buf2 = PyUnicode_DATA(str2);
10473 int srelease = 0, release1 = 0, release2 = 0;
10474 int skind = PyUnicode_KIND(self);
10475 int kind1 = PyUnicode_KIND(str1);
10476 int kind2 = PyUnicode_KIND(str2);
10477 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10478 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10479 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010480 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010481 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482
10483 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010486 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487
Victor Stinner59de0ee2011-10-07 10:01:28 +020010488 if (str1 == str2)
10489 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490
Victor Stinner49a0a212011-10-12 23:46:10 +020010491 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010492 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10493 if (maxchar < maxchar_str1)
10494 /* substring too wide to be present */
10495 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010496 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10497 /* Replacing str1 with str2 may cause a maxchar reduction in the
10498 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010499 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010500 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010503 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010505 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010507 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010508 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010509 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010510
Victor Stinner69ed0f42013-04-09 21:48:24 +020010511 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010512 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010513 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010514 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010515 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010517 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010519
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010520 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10521 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010522 }
10523 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 int rkind = skind;
10525 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010526 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 if (kind1 < rkind) {
10529 /* widen substring */
10530 buf1 = _PyUnicode_AsKind(str1, rkind);
10531 if (!buf1) goto error;
10532 release1 = 1;
10533 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010534 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 if (i < 0)
10536 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (rkind > kind2) {
10538 /* widen replacement */
10539 buf2 = _PyUnicode_AsKind(str2, rkind);
10540 if (!buf2) goto error;
10541 release2 = 1;
10542 }
10543 else if (rkind < kind2) {
10544 /* widen self and buf1 */
10545 rkind = kind2;
10546 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010547 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 sbuf = _PyUnicode_AsKind(self, rkind);
10549 if (!sbuf) goto error;
10550 srelease = 1;
10551 buf1 = _PyUnicode_AsKind(str1, rkind);
10552 if (!buf1) goto error;
10553 release1 = 1;
10554 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010555 u = PyUnicode_New(slen, maxchar);
10556 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010558 assert(PyUnicode_KIND(u) == rkind);
10559 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010560
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010561 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010562 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010563 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010565 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010567
10568 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010569 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010570 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010571 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010572 if (i == -1)
10573 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010574 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010576 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010580 }
10581 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010583 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 int rkind = skind;
10585 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010588 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 buf1 = _PyUnicode_AsKind(str1, rkind);
10590 if (!buf1) goto error;
10591 release1 = 1;
10592 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010593 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010594 if (n == 0)
10595 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010597 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 buf2 = _PyUnicode_AsKind(str2, rkind);
10599 if (!buf2) goto error;
10600 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010603 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 rkind = kind2;
10605 sbuf = _PyUnicode_AsKind(self, rkind);
10606 if (!sbuf) goto error;
10607 srelease = 1;
10608 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010609 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 buf1 = _PyUnicode_AsKind(str1, rkind);
10611 if (!buf1) goto error;
10612 release1 = 1;
10613 }
10614 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10615 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010616 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 PyErr_SetString(PyExc_OverflowError,
10618 "replace string is too long");
10619 goto error;
10620 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010621 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010622 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010623 _Py_INCREF_UNICODE_EMPTY();
10624 if (!unicode_empty)
10625 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010626 u = unicode_empty;
10627 goto done;
10628 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010629 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 PyErr_SetString(PyExc_OverflowError,
10631 "replace string is too long");
10632 goto error;
10633 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010634 u = PyUnicode_New(new_size, maxchar);
10635 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010637 assert(PyUnicode_KIND(u) == rkind);
10638 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 ires = i = 0;
10640 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 while (n-- > 0) {
10642 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010643 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010644 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010645 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010646 if (j == -1)
10647 break;
10648 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010650 memcpy(res + rkind * ires,
10651 sbuf + rkind * i,
10652 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 }
10655 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * ires,
10667 sbuf + rkind * i,
10668 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010669 }
10670 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010671 /* interleave */
10672 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010677 if (--n <= 0)
10678 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010679 memcpy(res + rkind * ires,
10680 sbuf + rkind * i,
10681 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 ires++;
10683 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010685 memcpy(res + rkind * ires,
10686 sbuf + rkind * i,
10687 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010689 }
10690
10691 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010692 unicode_adjust_maxchar(&u);
10693 if (u == NULL)
10694 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010696
10697 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 if (srelease)
10699 PyMem_FREE(sbuf);
10700 if (release1)
10701 PyMem_FREE(buf1);
10702 if (release2)
10703 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010704 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010706
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010708 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 if (srelease)
10710 PyMem_FREE(sbuf);
10711 if (release1)
10712 PyMem_FREE(buf1);
10713 if (release2)
10714 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010715 return unicode_result_unchanged(self);
10716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 error:
10718 if (srelease && sbuf)
10719 PyMem_FREE(sbuf);
10720 if (release1 && buf1)
10721 PyMem_FREE(buf1);
10722 if (release2 && buf2)
10723 PyMem_FREE(buf2);
10724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725}
10726
10727/* --- Unicode Object Methods --------------------------------------------- */
10728
INADA Naoki3ae20562017-01-16 20:41:20 +090010729/*[clinic input]
10730str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731
INADA Naoki3ae20562017-01-16 20:41:20 +090010732Return a version of the string where each word is titlecased.
10733
10734More specifically, words start with uppercased characters and all remaining
10735cased characters have lower case.
10736[clinic start generated code]*/
10737
10738static PyObject *
10739unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010740/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010742 if (PyUnicode_READY(self) == -1)
10743 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010744 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745}
10746
INADA Naoki3ae20562017-01-16 20:41:20 +090010747/*[clinic input]
10748str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
INADA Naoki3ae20562017-01-16 20:41:20 +090010750Return a capitalized version of the string.
10751
10752More specifically, make the first character have upper case and the rest lower
10753case.
10754[clinic start generated code]*/
10755
10756static PyObject *
10757unicode_capitalize_impl(PyObject *self)
10758/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010760 if (PyUnicode_READY(self) == -1)
10761 return NULL;
10762 if (PyUnicode_GET_LENGTH(self) == 0)
10763 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010764 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765}
10766
INADA Naoki3ae20562017-01-16 20:41:20 +090010767/*[clinic input]
10768str.casefold as unicode_casefold
10769
10770Return a version of the string suitable for caseless comparisons.
10771[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010772
10773static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010774unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010775/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010776{
10777 if (PyUnicode_READY(self) == -1)
10778 return NULL;
10779 if (PyUnicode_IS_ASCII(self))
10780 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010781 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010782}
10783
10784
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010785/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010786
10787static int
10788convert_uc(PyObject *obj, void *addr)
10789{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010791
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010792 if (!PyUnicode_Check(obj)) {
10793 PyErr_Format(PyExc_TypeError,
10794 "The fill character must be a unicode character, "
10795 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010796 return 0;
10797 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010798 if (PyUnicode_READY(obj) < 0)
10799 return 0;
10800 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010801 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010802 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010803 return 0;
10804 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010805 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010806 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010807}
10808
INADA Naoki3ae20562017-01-16 20:41:20 +090010809/*[clinic input]
10810str.center as unicode_center
10811
10812 width: Py_ssize_t
10813 fillchar: Py_UCS4 = ' '
10814 /
10815
10816Return a centered string of length width.
10817
10818Padding is done using the specified fill character (default is a space).
10819[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820
10821static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010822unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10823/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010825 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
Benjamin Petersonbac79492012-01-14 13:34:47 -050010827 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828 return NULL;
10829
Victor Stinnerc4b49542011-12-11 22:44:26 +010010830 if (PyUnicode_GET_LENGTH(self) >= width)
10831 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
Victor Stinnerc4b49542011-12-11 22:44:26 +010010833 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 left = marg / 2 + (marg & width & 1);
10835
Victor Stinner9310abb2011-10-05 00:59:23 +020010836 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837}
10838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839/* This function assumes that str1 and str2 are readied by the caller. */
10840
Marc-André Lemburge5034372000-08-08 08:04:29 +000010841static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010842unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010843{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010844#define COMPARE(TYPE1, TYPE2) \
10845 do { \
10846 TYPE1* p1 = (TYPE1 *)data1; \
10847 TYPE2* p2 = (TYPE2 *)data2; \
10848 TYPE1* end = p1 + len; \
10849 Py_UCS4 c1, c2; \
10850 for (; p1 != end; p1++, p2++) { \
10851 c1 = *p1; \
10852 c2 = *p2; \
10853 if (c1 != c2) \
10854 return (c1 < c2) ? -1 : 1; \
10855 } \
10856 } \
10857 while (0)
10858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 int kind1, kind2;
10860 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010861 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 kind1 = PyUnicode_KIND(str1);
10864 kind2 = PyUnicode_KIND(str2);
10865 data1 = PyUnicode_DATA(str1);
10866 data2 = PyUnicode_DATA(str2);
10867 len1 = PyUnicode_GET_LENGTH(str1);
10868 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010869 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010870
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010871 switch(kind1) {
10872 case PyUnicode_1BYTE_KIND:
10873 {
10874 switch(kind2) {
10875 case PyUnicode_1BYTE_KIND:
10876 {
10877 int cmp = memcmp(data1, data2, len);
10878 /* normalize result of memcmp() into the range [-1; 1] */
10879 if (cmp < 0)
10880 return -1;
10881 if (cmp > 0)
10882 return 1;
10883 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010884 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 case PyUnicode_2BYTE_KIND:
10886 COMPARE(Py_UCS1, Py_UCS2);
10887 break;
10888 case PyUnicode_4BYTE_KIND:
10889 COMPARE(Py_UCS1, Py_UCS4);
10890 break;
10891 default:
10892 assert(0);
10893 }
10894 break;
10895 }
10896 case PyUnicode_2BYTE_KIND:
10897 {
10898 switch(kind2) {
10899 case PyUnicode_1BYTE_KIND:
10900 COMPARE(Py_UCS2, Py_UCS1);
10901 break;
10902 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010903 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010904 COMPARE(Py_UCS2, Py_UCS2);
10905 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010906 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010907 case PyUnicode_4BYTE_KIND:
10908 COMPARE(Py_UCS2, Py_UCS4);
10909 break;
10910 default:
10911 assert(0);
10912 }
10913 break;
10914 }
10915 case PyUnicode_4BYTE_KIND:
10916 {
10917 switch(kind2) {
10918 case PyUnicode_1BYTE_KIND:
10919 COMPARE(Py_UCS4, Py_UCS1);
10920 break;
10921 case PyUnicode_2BYTE_KIND:
10922 COMPARE(Py_UCS4, Py_UCS2);
10923 break;
10924 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010925 {
10926#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10927 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10928 /* normalize result of wmemcmp() into the range [-1; 1] */
10929 if (cmp < 0)
10930 return -1;
10931 if (cmp > 0)
10932 return 1;
10933#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010934 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010935#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010936 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010937 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010938 default:
10939 assert(0);
10940 }
10941 break;
10942 }
10943 default:
10944 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010945 }
10946
Victor Stinner770e19e2012-10-04 22:59:45 +020010947 if (len1 == len2)
10948 return 0;
10949 if (len1 < len2)
10950 return -1;
10951 else
10952 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010953
10954#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010955}
10956
Benjamin Peterson621b4302016-09-09 13:54:34 -070010957static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010958unicode_compare_eq(PyObject *str1, PyObject *str2)
10959{
10960 int kind;
10961 void *data1, *data2;
10962 Py_ssize_t len;
10963 int cmp;
10964
Victor Stinnere5567ad2012-10-23 02:48:49 +020010965 len = PyUnicode_GET_LENGTH(str1);
10966 if (PyUnicode_GET_LENGTH(str2) != len)
10967 return 0;
10968 kind = PyUnicode_KIND(str1);
10969 if (PyUnicode_KIND(str2) != kind)
10970 return 0;
10971 data1 = PyUnicode_DATA(str1);
10972 data2 = PyUnicode_DATA(str2);
10973
10974 cmp = memcmp(data1, data2, len * kind);
10975 return (cmp == 0);
10976}
10977
10978
Alexander Belopolsky40018472011-02-26 01:02:56 +000010979int
10980PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10983 if (PyUnicode_READY(left) == -1 ||
10984 PyUnicode_READY(right) == -1)
10985 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010986
10987 /* a string is equal to itself */
10988 if (left == right)
10989 return 0;
10990
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010991 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010993 PyErr_Format(PyExc_TypeError,
10994 "Can't compare %.100s and %.100s",
10995 left->ob_type->tp_name,
10996 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997 return -1;
10998}
10999
Martin v. Löwis5b222132007-06-10 09:51:05 +000011000int
11001PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 Py_ssize_t i;
11004 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011006 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007
Victor Stinner910337b2011-10-03 03:20:16 +020011008 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011009 if (!PyUnicode_IS_READY(uni)) {
11010 const wchar_t *ws = _PyUnicode_WSTR(uni);
11011 /* Compare Unicode string and source character set string */
11012 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11013 if (chr != ustr[i])
11014 return (chr < ustr[i]) ? -1 : 1;
11015 }
11016 /* This check keeps Python strings that end in '\0' from comparing equal
11017 to C strings identical up to that point. */
11018 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11019 return 1; /* uni is longer */
11020 if (ustr[i])
11021 return -1; /* str is longer */
11022 return 0;
11023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011025 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011026 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011027 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011028 size_t len, len2 = strlen(str);
11029 int cmp;
11030
11031 len = Py_MIN(len1, len2);
11032 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011033 if (cmp != 0) {
11034 if (cmp < 0)
11035 return -1;
11036 else
11037 return 1;
11038 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011039 if (len1 > len2)
11040 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011041 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011042 return -1; /* str is longer */
11043 return 0;
11044 }
11045 else {
11046 void *data = PyUnicode_DATA(uni);
11047 /* Compare Unicode string and source character set string */
11048 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011049 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011050 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11051 /* This check keeps Python strings that end in '\0' from comparing equal
11052 to C strings identical up to that point. */
11053 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11054 return 1; /* uni is longer */
11055 if (str[i])
11056 return -1; /* str is longer */
11057 return 0;
11058 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011059}
11060
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011061static int
11062non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11063{
11064 size_t i, len;
11065 const wchar_t *p;
11066 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11067 if (strlen(str) != len)
11068 return 0;
11069 p = _PyUnicode_WSTR(unicode);
11070 assert(p);
11071 for (i = 0; i < len; i++) {
11072 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011073 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011074 return 0;
11075 }
11076 return 1;
11077}
11078
11079int
11080_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11081{
11082 size_t len;
11083 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011084 assert(str);
11085#ifndef NDEBUG
11086 for (const char *p = str; *p; p++) {
11087 assert((unsigned char)*p < 128);
11088 }
11089#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011090 if (PyUnicode_READY(unicode) == -1) {
11091 /* Memory error or bad data */
11092 PyErr_Clear();
11093 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11094 }
11095 if (!PyUnicode_IS_ASCII(unicode))
11096 return 0;
11097 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11098 return strlen(str) == len &&
11099 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11100}
11101
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011102int
11103_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11104{
11105 PyObject *right_uni;
11106 Py_hash_t hash;
11107
11108 assert(_PyUnicode_CHECK(left));
11109 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011110#ifndef NDEBUG
11111 for (const char *p = right->string; *p; p++) {
11112 assert((unsigned char)*p < 128);
11113 }
11114#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011115
11116 if (PyUnicode_READY(left) == -1) {
11117 /* memory error or bad data */
11118 PyErr_Clear();
11119 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11120 }
11121
11122 if (!PyUnicode_IS_ASCII(left))
11123 return 0;
11124
11125 right_uni = _PyUnicode_FromId(right); /* borrowed */
11126 if (right_uni == NULL) {
11127 /* memory error or bad data */
11128 PyErr_Clear();
11129 return _PyUnicode_EqualToASCIIString(left, right->string);
11130 }
11131
11132 if (left == right_uni)
11133 return 1;
11134
11135 if (PyUnicode_CHECK_INTERNED(left))
11136 return 0;
11137
11138 assert(_PyUnicode_HASH(right_uni) != 1);
11139 hash = _PyUnicode_HASH(left);
11140 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11141 return 0;
11142
11143 return unicode_compare_eq(left, right_uni);
11144}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011145
/* Select Py_True or Py_False from a C truth value.  The macro only picks
   the object; it does not touch its reference count. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011148
Alexander Belopolsky40018472011-02-26 01:02:56 +000011149PyObject *
11150PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011151{
11152 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011153 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011154
Victor Stinnere5567ad2012-10-23 02:48:49 +020011155 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11156 Py_RETURN_NOTIMPLEMENTED;
11157
11158 if (PyUnicode_READY(left) == -1 ||
11159 PyUnicode_READY(right) == -1)
11160 return NULL;
11161
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011162 if (left == right) {
11163 switch (op) {
11164 case Py_EQ:
11165 case Py_LE:
11166 case Py_GE:
11167 /* a string is equal to itself */
11168 v = Py_True;
11169 break;
11170 case Py_NE:
11171 case Py_LT:
11172 case Py_GT:
11173 v = Py_False;
11174 break;
11175 default:
11176 PyErr_BadArgument();
11177 return NULL;
11178 }
11179 }
11180 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011181 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011182 result ^= (op == Py_NE);
11183 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011184 }
11185 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011186 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011187
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011188 /* Convert the return value to a Boolean */
11189 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011190 case Py_LE:
11191 v = TEST_COND(result <= 0);
11192 break;
11193 case Py_GE:
11194 v = TEST_COND(result >= 0);
11195 break;
11196 case Py_LT:
11197 v = TEST_COND(result == -1);
11198 break;
11199 case Py_GT:
11200 v = TEST_COND(result == 1);
11201 break;
11202 default:
11203 PyErr_BadArgument();
11204 return NULL;
11205 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011206 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011207 Py_INCREF(v);
11208 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011209}
11210
Alexander Belopolsky40018472011-02-26 01:02:56 +000011211int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011212_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11213{
11214 return unicode_eq(aa, bb);
11215}
11216
11217int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011218PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011219{
Victor Stinner77282cb2013-04-14 19:22:47 +020011220 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 void *buf1, *buf2;
11222 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011223 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011224
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227 "'in <string>' requires string as left operand, not %.100s",
11228 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011229 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011230 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011231 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011232 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011233 if (ensure_unicode(str) < 0)
11234 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011237 kind2 = PyUnicode_KIND(substr);
11238 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011239 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011241 len2 = PyUnicode_GET_LENGTH(substr);
11242 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011243 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011244 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011245 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011246 if (len2 == 1) {
11247 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11248 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011249 return result;
11250 }
11251 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011252 buf2 = _PyUnicode_AsKind(substr, kind1);
11253 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011254 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256
Victor Stinner77282cb2013-04-14 19:22:47 +020011257 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 case PyUnicode_1BYTE_KIND:
11259 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11260 break;
11261 case PyUnicode_2BYTE_KIND:
11262 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11263 break;
11264 case PyUnicode_4BYTE_KIND:
11265 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11266 break;
11267 default:
11268 result = -1;
11269 assert(0);
11270 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011271
Victor Stinner77282cb2013-04-14 19:22:47 +020011272 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 PyMem_Free(buf2);
11274
Guido van Rossum403d68b2000-03-13 15:55:09 +000011275 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011276}
11277
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278/* Concat to string or Unicode object giving a new Unicode object. */
11279
Alexander Belopolsky40018472011-02-26 01:02:56 +000011280PyObject *
11281PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011283 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011284 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011285 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011287 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289
11290 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011291 if (left == unicode_empty)
11292 return PyUnicode_FromObject(right);
11293 if (right == unicode_empty)
11294 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011296 left_len = PyUnicode_GET_LENGTH(left);
11297 right_len = PyUnicode_GET_LENGTH(right);
11298 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011299 PyErr_SetString(PyExc_OverflowError,
11300 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011302 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011304
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011305 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11306 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011307 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011310 result = PyUnicode_New(new_len, maxchar);
11311 if (result == NULL)
11312 return NULL;
11313 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11314 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11315 assert(_PyUnicode_CheckConsistency(result, 1));
11316 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317}
11318
Walter Dörwald1ab83302007-05-18 17:15:44 +000011319void
Victor Stinner23e56682011-10-03 03:54:37 +020011320PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011321{
Victor Stinner23e56682011-10-03 03:54:37 +020011322 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011323 Py_UCS4 maxchar, maxchar2;
11324 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011325
11326 if (p_left == NULL) {
11327 if (!PyErr_Occurred())
11328 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011329 return;
11330 }
Victor Stinner23e56682011-10-03 03:54:37 +020011331 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011332 if (right == NULL || left == NULL
11333 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011334 if (!PyErr_Occurred())
11335 PyErr_BadInternalCall();
11336 goto error;
11337 }
11338
Benjamin Petersonbac79492012-01-14 13:34:47 -050011339 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011340 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011341 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011342 goto error;
11343
Victor Stinner488fa492011-12-12 00:01:39 +010011344 /* Shortcuts */
11345 if (left == unicode_empty) {
11346 Py_DECREF(left);
11347 Py_INCREF(right);
11348 *p_left = right;
11349 return;
11350 }
11351 if (right == unicode_empty)
11352 return;
11353
11354 left_len = PyUnicode_GET_LENGTH(left);
11355 right_len = PyUnicode_GET_LENGTH(right);
11356 if (left_len > PY_SSIZE_T_MAX - right_len) {
11357 PyErr_SetString(PyExc_OverflowError,
11358 "strings are too large to concat");
11359 goto error;
11360 }
11361 new_len = left_len + right_len;
11362
11363 if (unicode_modifiable(left)
11364 && PyUnicode_CheckExact(right)
11365 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011366 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11367 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011368 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011369 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011370 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11371 {
11372 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011373 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011374 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011375
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011376 /* copy 'right' into the newly allocated area of 'left' */
11377 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011378 }
Victor Stinner488fa492011-12-12 00:01:39 +010011379 else {
11380 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11381 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011382 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011383
Victor Stinner488fa492011-12-12 00:01:39 +010011384 /* Concat the two Unicode strings */
11385 res = PyUnicode_New(new_len, maxchar);
11386 if (res == NULL)
11387 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011388 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11389 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011390 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011391 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011392 }
11393 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011394 return;
11395
11396error:
Victor Stinner488fa492011-12-12 00:01:39 +010011397 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011398}
11399
11400void
11401PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11402{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011403 PyUnicode_Append(pleft, right);
11404 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011405}
11406
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011407/*
11408Wraps stringlib_parse_args_finds() and additionally ensures that the
11409first argument is a unicode object.
11410*/
11411
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011412static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011413parse_args_finds_unicode(const char * function_name, PyObject *args,
11414 PyObject **substring,
11415 Py_ssize_t *start, Py_ssize_t *end)
11416{
11417 if(stringlib_parse_args_finds(function_name, args, substring,
11418 start, end)) {
11419 if (ensure_unicode(*substring) < 0)
11420 return 0;
11421 return 1;
11422 }
11423 return 0;
11424}
11425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011427 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011429Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011430string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011431interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432
11433static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011434unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011436 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011437 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011438 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011440 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 void *buf1, *buf2;
11442 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011444 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 kind1 = PyUnicode_KIND(self);
11448 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011449 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011450 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 len1 = PyUnicode_GET_LENGTH(self);
11453 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011455 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011456 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011457
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011458 buf1 = PyUnicode_DATA(self);
11459 buf2 = PyUnicode_DATA(substring);
11460 if (kind2 != kind1) {
11461 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011462 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011463 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011464 }
11465 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 case PyUnicode_1BYTE_KIND:
11467 iresult = ucs1lib_count(
11468 ((Py_UCS1*)buf1) + start, end - start,
11469 buf2, len2, PY_SSIZE_T_MAX
11470 );
11471 break;
11472 case PyUnicode_2BYTE_KIND:
11473 iresult = ucs2lib_count(
11474 ((Py_UCS2*)buf1) + start, end - start,
11475 buf2, len2, PY_SSIZE_T_MAX
11476 );
11477 break;
11478 case PyUnicode_4BYTE_KIND:
11479 iresult = ucs4lib_count(
11480 ((Py_UCS4*)buf1) + start, end - start,
11481 buf2, len2, PY_SSIZE_T_MAX
11482 );
11483 break;
11484 default:
11485 assert(0); iresult = 0;
11486 }
11487
11488 result = PyLong_FromSsize_t(iresult);
11489
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011490 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 return result;
11494}
11495
INADA Naoki3ae20562017-01-16 20:41:20 +090011496/*[clinic input]
11497str.encode as unicode_encode
11498
11499 encoding: str(c_default="NULL") = 'utf-8'
11500 The encoding in which to encode the string.
11501 errors: str(c_default="NULL") = 'strict'
11502 The error handling scheme to use for encoding errors.
11503 The default is 'strict' meaning that encoding errors raise a
11504 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11505 'xmlcharrefreplace' as well as any other name registered with
11506 codecs.register_error that can handle UnicodeEncodeErrors.
11507
11508Encode the string using the codec registered for encoding.
11509[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
11511static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011512unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011513/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011515 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011516}
11517
INADA Naoki3ae20562017-01-16 20:41:20 +090011518/*[clinic input]
11519str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520
INADA Naoki3ae20562017-01-16 20:41:20 +090011521 tabsize: int = 8
11522
11523Return a copy where all tab characters are expanded using spaces.
11524
11525If tabsize is not given, a tab size of 8 characters is assumed.
11526[clinic start generated code]*/
11527
11528static PyObject *
11529unicode_expandtabs_impl(PyObject *self, int tabsize)
11530/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011532 Py_ssize_t i, j, line_pos, src_len, incr;
11533 Py_UCS4 ch;
11534 PyObject *u;
11535 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011536 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011537 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538
Antoine Pitrou22425222011-10-04 19:10:51 +020011539 if (PyUnicode_READY(self) == -1)
11540 return NULL;
11541
Thomas Wouters7e474022000-07-16 12:04:32 +000011542 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011543 src_len = PyUnicode_GET_LENGTH(self);
11544 i = j = line_pos = 0;
11545 kind = PyUnicode_KIND(self);
11546 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011547 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011548 for (; i < src_len; i++) {
11549 ch = PyUnicode_READ(kind, src_data, i);
11550 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011551 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011553 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011555 goto overflow;
11556 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011558 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011562 goto overflow;
11563 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011565 if (ch == '\n' || ch == '\r')
11566 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011569 if (!found)
11570 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011571
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011573 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574 if (!u)
11575 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Antoine Pitroue71d5742011-10-04 15:55:09 +020011578 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579
Antoine Pitroue71d5742011-10-04 15:55:09 +020011580 for (; i < src_len; i++) {
11581 ch = PyUnicode_READ(kind, src_data, i);
11582 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011584 incr = tabsize - (line_pos % tabsize);
11585 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011586 FILL(kind, dest_data, ' ', j, incr);
11587 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011589 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011591 line_pos++;
11592 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011593 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011594 if (ch == '\n' || ch == '\r')
11595 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 }
11598 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011599 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011600
Antoine Pitroue71d5742011-10-04 15:55:09 +020011601 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011602 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604}
11605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011606PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608\n\
11609Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011610such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611arguments start and end are interpreted as in slice notation.\n\
11612\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011613Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614
11615static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011618 /* initialize variables to prevent gcc warning */
11619 PyObject *substring = NULL;
11620 Py_ssize_t start = 0;
11621 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011622 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011624 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011627 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011630 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (result == -2)
11633 return NULL;
11634
Christian Heimes217cfd12007-12-02 14:31:20 +000011635 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636}
11637
11638static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011639unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011641 void *data;
11642 enum PyUnicode_Kind kind;
11643 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011644
11645 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11646 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011648 }
11649 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11650 PyErr_SetString(PyExc_IndexError, "string index out of range");
11651 return NULL;
11652 }
11653 kind = PyUnicode_KIND(self);
11654 data = PyUnicode_DATA(self);
11655 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011656 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657}
11658
Guido van Rossumc2504932007-09-18 19:42:40 +000011659/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011660 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011661static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011662unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663{
Guido van Rossumc2504932007-09-18 19:42:40 +000011664 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011665 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011666
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011667#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011668 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011669#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 if (_PyUnicode_HASH(self) != -1)
11671 return _PyUnicode_HASH(self);
11672 if (PyUnicode_READY(self) == -1)
11673 return -1;
11674 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011675 /*
11676 We make the hash of the empty string be 0, rather than using
11677 (prefix ^ suffix), since this slightly obfuscates the hash secret
11678 */
11679 if (len == 0) {
11680 _PyUnicode_HASH(self) = 0;
11681 return 0;
11682 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011683 x = _Py_HashBytes(PyUnicode_DATA(self),
11684 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011686 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687}
11688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011689PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011692Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693
11694static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011697 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011698 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011699 PyObject *substring = NULL;
11700 Py_ssize_t start = 0;
11701 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011703 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011706 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011709 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 if (result == -2)
11712 return NULL;
11713
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714 if (result < 0) {
11715 PyErr_SetString(PyExc_ValueError, "substring not found");
11716 return NULL;
11717 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011718
Christian Heimes217cfd12007-12-02 14:31:20 +000011719 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720}
11721
INADA Naoki3ae20562017-01-16 20:41:20 +090011722/*[clinic input]
11723str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
INADA Naoki3ae20562017-01-16 20:41:20 +090011725Return True if the string is a lowercase string, False otherwise.
11726
11727A string is lowercase if all cased characters in the string are lowercase and
11728there is at least one cased character in the string.
11729[clinic start generated code]*/
11730
11731static PyObject *
11732unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011733/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 Py_ssize_t i, length;
11736 int kind;
11737 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738 int cased;
11739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 if (PyUnicode_READY(self) == -1)
11741 return NULL;
11742 length = PyUnicode_GET_LENGTH(self);
11743 kind = PyUnicode_KIND(self);
11744 data = PyUnicode_DATA(self);
11745
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (length == 1)
11748 return PyBool_FromLong(
11749 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011751 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011754
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 for (i = 0; i < length; i++) {
11757 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011758
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11760 return PyBool_FromLong(0);
11761 else if (!cased && Py_UNICODE_ISLOWER(ch))
11762 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011764 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765}
11766
INADA Naoki3ae20562017-01-16 20:41:20 +090011767/*[clinic input]
11768str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769
INADA Naoki3ae20562017-01-16 20:41:20 +090011770Return True if the string is an uppercase string, False otherwise.
11771
11772A string is uppercase if all cased characters in the string are uppercase and
11773there is at least one cased character in the string.
11774[clinic start generated code]*/
11775
11776static PyObject *
11777unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011778/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 Py_ssize_t i, length;
11781 int kind;
11782 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 int cased;
11784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (PyUnicode_READY(self) == -1)
11786 return NULL;
11787 length = PyUnicode_GET_LENGTH(self);
11788 kind = PyUnicode_KIND(self);
11789 data = PyUnicode_DATA(self);
11790
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 if (length == 1)
11793 return PyBool_FromLong(
11794 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011796 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011799
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 for (i = 0; i < length; i++) {
11802 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011803
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11805 return PyBool_FromLong(0);
11806 else if (!cased && Py_UNICODE_ISUPPER(ch))
11807 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011809 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810}
11811
INADA Naoki3ae20562017-01-16 20:41:20 +090011812/*[clinic input]
11813str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
INADA Naoki3ae20562017-01-16 20:41:20 +090011815Return True if the string is a title-cased string, False otherwise.
11816
11817In a title-cased string, upper- and title-case characters may only
11818follow uncased characters and lowercase characters only cased ones.
11819[clinic start generated code]*/
11820
11821static PyObject *
11822unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011823/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 Py_ssize_t i, length;
11826 int kind;
11827 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 int cased, previous_is_cased;
11829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 if (PyUnicode_READY(self) == -1)
11831 return NULL;
11832 length = PyUnicode_GET_LENGTH(self);
11833 kind = PyUnicode_KIND(self);
11834 data = PyUnicode_DATA(self);
11835
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (length == 1) {
11838 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11839 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11840 (Py_UNICODE_ISUPPER(ch) != 0));
11841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011843 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011846
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 cased = 0;
11848 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 for (i = 0; i < length; i++) {
11850 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011851
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11853 if (previous_is_cased)
11854 return PyBool_FromLong(0);
11855 previous_is_cased = 1;
11856 cased = 1;
11857 }
11858 else if (Py_UNICODE_ISLOWER(ch)) {
11859 if (!previous_is_cased)
11860 return PyBool_FromLong(0);
11861 previous_is_cased = 1;
11862 cased = 1;
11863 }
11864 else
11865 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011867 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868}
11869
INADA Naoki3ae20562017-01-16 20:41:20 +090011870/*[clinic input]
11871str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872
INADA Naoki3ae20562017-01-16 20:41:20 +090011873Return True if the string is a whitespace string, False otherwise.
11874
11875A string is whitespace if all characters in the string are whitespace and there
11876is at least one character in the string.
11877[clinic start generated code]*/
11878
11879static PyObject *
11880unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011881/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 Py_ssize_t i, length;
11884 int kind;
11885 void *data;
11886
11887 if (PyUnicode_READY(self) == -1)
11888 return NULL;
11889 length = PyUnicode_GET_LENGTH(self);
11890 kind = PyUnicode_KIND(self);
11891 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 if (length == 1)
11895 return PyBool_FromLong(
11896 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011898 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 for (i = 0; i < length; i++) {
11903 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011904 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011907 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908}
11909
INADA Naoki3ae20562017-01-16 20:41:20 +090011910/*[clinic input]
11911str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011912
INADA Naoki3ae20562017-01-16 20:41:20 +090011913Return True if the string is an alphabetic string, False otherwise.
11914
11915A string is alphabetic if all characters in the string are alphabetic and there
11916is at least one character in the string.
11917[clinic start generated code]*/
11918
11919static PyObject *
11920unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011921/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 Py_ssize_t i, length;
11924 int kind;
11925 void *data;
11926
11927 if (PyUnicode_READY(self) == -1)
11928 return NULL;
11929 length = PyUnicode_GET_LENGTH(self);
11930 kind = PyUnicode_KIND(self);
11931 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011932
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011933 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 if (length == 1)
11935 return PyBool_FromLong(
11936 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011937
11938 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 for (i = 0; i < length; i++) {
11943 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011945 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011946 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011947}
11948
INADA Naoki3ae20562017-01-16 20:41:20 +090011949/*[clinic input]
11950str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011951
INADA Naoki3ae20562017-01-16 20:41:20 +090011952Return True if the string is an alpha-numeric string, False otherwise.
11953
11954A string is alpha-numeric if all characters in the string are alpha-numeric and
11955there is at least one character in the string.
11956[clinic start generated code]*/
11957
11958static PyObject *
11959unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011960/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 int kind;
11963 void *data;
11964 Py_ssize_t len, i;
11965
11966 if (PyUnicode_READY(self) == -1)
11967 return NULL;
11968
11969 kind = PyUnicode_KIND(self);
11970 data = PyUnicode_DATA(self);
11971 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011972
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011973 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (len == 1) {
11975 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11976 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11977 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011978
11979 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 for (i = 0; i < len; i++) {
11984 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011985 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011987 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011988 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011989}
11990
INADA Naoki3ae20562017-01-16 20:41:20 +090011991/*[clinic input]
11992str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993
INADA Naoki3ae20562017-01-16 20:41:20 +090011994Return True if the string is a decimal string, False otherwise.
11995
11996A string is a decimal string if all characters in the string are decimal and
11997there is at least one character in the string.
11998[clinic start generated code]*/
11999
12000static PyObject *
12001unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012002/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 Py_ssize_t i, length;
12005 int kind;
12006 void *data;
12007
12008 if (PyUnicode_READY(self) == -1)
12009 return NULL;
12010 length = PyUnicode_GET_LENGTH(self);
12011 kind = PyUnicode_KIND(self);
12012 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 if (length == 1)
12016 return PyBool_FromLong(
12017 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012019 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 for (i = 0; i < length; i++) {
12024 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012027 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028}
12029
INADA Naoki3ae20562017-01-16 20:41:20 +090012030/*[clinic input]
12031str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032
INADA Naoki3ae20562017-01-16 20:41:20 +090012033Return True if the string is a digit string, False otherwise.
12034
12035A string is a digit string if all characters in the string are digits and there
12036is at least one character in the string.
12037[clinic start generated code]*/
12038
12039static PyObject *
12040unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012041/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 Py_ssize_t i, length;
12044 int kind;
12045 void *data;
12046
12047 if (PyUnicode_READY(self) == -1)
12048 return NULL;
12049 length = PyUnicode_GET_LENGTH(self);
12050 kind = PyUnicode_KIND(self);
12051 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 if (length == 1) {
12055 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12056 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012059 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 for (i = 0; i < length; i++) {
12064 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012067 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068}
12069
INADA Naoki3ae20562017-01-16 20:41:20 +090012070/*[clinic input]
12071str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072
INADA Naoki3ae20562017-01-16 20:41:20 +090012073Return True if the string is a numeric string, False otherwise.
12074
12075A string is numeric if all characters in the string are numeric and there is at
12076least one character in the string.
12077[clinic start generated code]*/
12078
12079static PyObject *
12080unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012081/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 Py_ssize_t i, length;
12084 int kind;
12085 void *data;
12086
12087 if (PyUnicode_READY(self) == -1)
12088 return NULL;
12089 length = PyUnicode_GET_LENGTH(self);
12090 kind = PyUnicode_KIND(self);
12091 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 if (length == 1)
12095 return PyBool_FromLong(
12096 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012098 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 for (i = 0; i < length; i++) {
12103 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012106 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107}
12108
Martin v. Löwis47383402007-08-15 07:32:56 +000012109int
12110PyUnicode_IsIdentifier(PyObject *self)
12111{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 int kind;
12113 void *data;
12114 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012115 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 if (PyUnicode_READY(self) == -1) {
12118 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 }
12121
12122 /* Special case for empty strings */
12123 if (PyUnicode_GET_LENGTH(self) == 0)
12124 return 0;
12125 kind = PyUnicode_KIND(self);
12126 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012127
12128 /* PEP 3131 says that the first character must be in
12129 XID_Start and subsequent characters in XID_Continue,
12130 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012131 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012132 letters, digits, underscore). However, given the current
12133 definition of XID_Start and XID_Continue, it is sufficient
12134 to check just for these, except that _ must be allowed
12135 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012137 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012138 return 0;
12139
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012140 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012143 return 1;
12144}
12145
INADA Naoki3ae20562017-01-16 20:41:20 +090012146/*[clinic input]
12147str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012148
INADA Naoki3ae20562017-01-16 20:41:20 +090012149Return True if the string is a valid Python identifier, False otherwise.
12150
12151Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12152"class".
12153[clinic start generated code]*/
12154
12155static PyObject *
12156unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012157/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012158{
12159 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12160}
12161
INADA Naoki3ae20562017-01-16 20:41:20 +090012162/*[clinic input]
12163str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012164
INADA Naoki3ae20562017-01-16 20:41:20 +090012165Return True if the string is printable, False otherwise.
12166
12167A string is printable if all of its characters are considered printable in
12168repr() or if it is empty.
12169[clinic start generated code]*/
12170
12171static PyObject *
12172unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012173/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012174{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 Py_ssize_t i, length;
12176 int kind;
12177 void *data;
12178
12179 if (PyUnicode_READY(self) == -1)
12180 return NULL;
12181 length = PyUnicode_GET_LENGTH(self);
12182 kind = PyUnicode_KIND(self);
12183 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012184
12185 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 if (length == 1)
12187 return PyBool_FromLong(
12188 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 for (i = 0; i < length; i++) {
12191 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012192 Py_RETURN_FALSE;
12193 }
12194 }
12195 Py_RETURN_TRUE;
12196}
12197
INADA Naoki3ae20562017-01-16 20:41:20 +090012198/*[clinic input]
12199str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
INADA Naoki3ae20562017-01-16 20:41:20 +090012201 iterable: object
12202 /
12203
12204Concatenate any number of strings.
12205
The string whose method is called is inserted in between each given string.
12207The result is returned as a new string.
12208
12209Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12210[clinic start generated code]*/
12211
12212static PyObject *
12213unicode_join(PyObject *self, PyObject *iterable)
INADA Naoki15f94592017-01-16 21:49:13 +090012214/*[clinic end generated code: output=6857e7cecfe7bf98 input=d8311e5ccbafbeb6]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215{
INADA Naoki3ae20562017-01-16 20:41:20 +090012216 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217}
12218
Martin v. Löwis18e16552006-02-15 17:27:45 +000012219static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012220unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 if (PyUnicode_READY(self) == -1)
12223 return -1;
12224 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225}
12226
INADA Naoki3ae20562017-01-16 20:41:20 +090012227/*[clinic input]
12228str.ljust as unicode_ljust
12229
12230 width: Py_ssize_t
12231 fillchar: Py_UCS4 = ' '
12232 /
12233
12234Return a left-justified string of length width.
12235
12236Padding is done using the specified fill character (default is a space).
12237[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238
12239static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012240unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12241/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012243 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245
Victor Stinnerc4b49542011-12-11 22:44:26 +010012246 if (PyUnicode_GET_LENGTH(self) >= width)
12247 return unicode_result_unchanged(self);
12248
12249 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250}
12251
INADA Naoki3ae20562017-01-16 20:41:20 +090012252/*[clinic input]
12253str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254
INADA Naoki3ae20562017-01-16 20:41:20 +090012255Return a copy of the string converted to lowercase.
12256[clinic start generated code]*/
12257
12258static PyObject *
12259unicode_lower_impl(PyObject *self)
12260/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012262 if (PyUnicode_READY(self) == -1)
12263 return NULL;
12264 if (PyUnicode_IS_ASCII(self))
12265 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012266 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267}
12268
/* Selectors for which end(s) of a string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the strings and the
   table itself are read-only, so the array is fully const-qualified. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip selector to the matching method name (used in messages). */
#define STRIPNAME(i) (stripfuncnames[(i)])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012278/* externally visible for str.strip(unicode) */
12279PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012280_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 void *data;
12283 int kind;
12284 Py_ssize_t i, j, len;
12285 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012286 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12289 return NULL;
12290
12291 kind = PyUnicode_KIND(self);
12292 data = PyUnicode_DATA(self);
12293 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012294 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12296 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012297 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012298
Benjamin Peterson14339b62009-01-31 16:36:08 +000012299 i = 0;
12300 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012301 while (i < len) {
12302 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12303 if (!BLOOM(sepmask, ch))
12304 break;
12305 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12306 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 i++;
12308 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012309 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012310
Benjamin Peterson14339b62009-01-31 16:36:08 +000012311 j = len;
12312 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012313 j--;
12314 while (j >= i) {
12315 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12316 if (!BLOOM(sepmask, ch))
12317 break;
12318 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12319 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012321 }
12322
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012324 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012325
Victor Stinner7931d9a2011-11-04 00:22:48 +010012326 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327}
12328
12329PyObject*
12330PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12331{
12332 unsigned char *data;
12333 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012334 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335
Victor Stinnerde636f32011-10-01 03:55:54 +020012336 if (PyUnicode_READY(self) == -1)
12337 return NULL;
12338
Victor Stinner684d5fd2012-05-03 02:32:34 +020012339 length = PyUnicode_GET_LENGTH(self);
12340 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012341
Victor Stinner684d5fd2012-05-03 02:32:34 +020012342 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012343 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344
Victor Stinnerde636f32011-10-01 03:55:54 +020012345 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012346 PyErr_SetString(PyExc_IndexError, "string index out of range");
12347 return NULL;
12348 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012349 if (start >= length || end < start)
12350 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012351
Victor Stinner684d5fd2012-05-03 02:32:34 +020012352 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012353 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012354 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012355 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012356 }
12357 else {
12358 kind = PyUnicode_KIND(self);
12359 data = PyUnicode_1BYTE_DATA(self);
12360 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012361 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012362 length);
12363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365
12366static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012367do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 Py_ssize_t len, i, j;
12370
12371 if (PyUnicode_READY(self) == -1)
12372 return NULL;
12373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012375
Victor Stinnercc7af722013-04-09 22:39:24 +020012376 if (PyUnicode_IS_ASCII(self)) {
12377 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12378
12379 i = 0;
12380 if (striptype != RIGHTSTRIP) {
12381 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012382 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012383 if (!_Py_ascii_whitespace[ch])
12384 break;
12385 i++;
12386 }
12387 }
12388
12389 j = len;
12390 if (striptype != LEFTSTRIP) {
12391 j--;
12392 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012393 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012394 if (!_Py_ascii_whitespace[ch])
12395 break;
12396 j--;
12397 }
12398 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012399 }
12400 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012401 else {
12402 int kind = PyUnicode_KIND(self);
12403 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012404
Victor Stinnercc7af722013-04-09 22:39:24 +020012405 i = 0;
12406 if (striptype != RIGHTSTRIP) {
12407 while (i < len) {
12408 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12409 if (!Py_UNICODE_ISSPACE(ch))
12410 break;
12411 i++;
12412 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012413 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012414
12415 j = len;
12416 if (striptype != LEFTSTRIP) {
12417 j--;
12418 while (j >= i) {
12419 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12420 if (!Py_UNICODE_ISSPACE(ch))
12421 break;
12422 j--;
12423 }
12424 j++;
12425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012426 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012427
Victor Stinner7931d9a2011-11-04 00:22:48 +010012428 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012429}
12430
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012431
12432static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012433do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012434{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012435 if (sep != NULL && sep != Py_None) {
12436 if (PyUnicode_Check(sep))
12437 return _PyUnicode_XStrip(self, striptype, sep);
12438 else {
12439 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 "%s arg must be None or str",
12441 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 return NULL;
12443 }
12444 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012445
Benjamin Peterson14339b62009-01-31 16:36:08 +000012446 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012447}
12448
12449
INADA Naoki3ae20562017-01-16 20:41:20 +090012450/*[clinic input]
12451str.strip as unicode_strip
12452
12453 chars: object = None
12454 /
12455
Victor Stinner0c4a8282017-01-17 02:21:47 +010012456Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012457
12458If chars is given and not None, remove characters in chars instead.
12459[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460
12461static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012462unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012463/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012464{
INADA Naoki3ae20562017-01-16 20:41:20 +090012465 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012466}
12467
12468
INADA Naoki3ae20562017-01-16 20:41:20 +090012469/*[clinic input]
12470str.lstrip as unicode_lstrip
12471
12472 chars: object = NULL
12473 /
12474
12475Return a copy of the string with leading whitespace removed.
12476
12477If chars is given and not None, remove characters in chars instead.
12478[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012479
12480static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012481unicode_lstrip_impl(PyObject *self, PyObject *chars)
12482/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012483{
INADA Naoki3ae20562017-01-16 20:41:20 +090012484 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012485}
12486
12487
INADA Naoki3ae20562017-01-16 20:41:20 +090012488/*[clinic input]
12489str.rstrip as unicode_rstrip
12490
12491 chars: object = NULL
12492 /
12493
12494Return a copy of the string with trailing whitespace removed.
12495
12496If chars is given and not None, remove characters in chars instead.
12497[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012498
12499static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012500unicode_rstrip_impl(PyObject *self, PyObject *chars)
12501/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012502{
INADA Naoki3ae20562017-01-16 20:41:20 +090012503 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012504}
12505
12506
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012508unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012510 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512
Serhiy Storchaka05997252013-01-26 12:14:02 +020012513 if (len < 1)
12514 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
Victor Stinnerc4b49542011-12-11 22:44:26 +010012516 /* no repeat, return original string */
12517 if (len == 1)
12518 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012519
Benjamin Petersonbac79492012-01-14 13:34:47 -050012520 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 return NULL;
12522
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012523 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012524 PyErr_SetString(PyExc_OverflowError,
12525 "repeated string is too long");
12526 return NULL;
12527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012529
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012530 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531 if (!u)
12532 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012533 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 if (PyUnicode_GET_LENGTH(str) == 1) {
12536 const int kind = PyUnicode_KIND(str);
12537 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012538 if (kind == PyUnicode_1BYTE_KIND) {
12539 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012540 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012541 }
12542 else if (kind == PyUnicode_2BYTE_KIND) {
12543 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012544 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012545 ucs2[n] = fill_char;
12546 } else {
12547 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12548 assert(kind == PyUnicode_4BYTE_KIND);
12549 for (n = 0; n < len; ++n)
12550 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 }
12553 else {
12554 /* number of characters copied this far */
12555 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012556 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012558 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012562 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012563 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565 }
12566
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012567 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012568 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569}
12570
Alexander Belopolsky40018472011-02-26 01:02:56 +000012571PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012572PyUnicode_Replace(PyObject *str,
12573 PyObject *substr,
12574 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012575 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012577 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12578 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012579 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012580 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581}
12582
INADA Naoki3ae20562017-01-16 20:41:20 +090012583/*[clinic input]
12584str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585
INADA Naoki3ae20562017-01-16 20:41:20 +090012586 old: unicode
12587 new: unicode
12588 count: Py_ssize_t = -1
12589 Maximum number of occurrences to replace.
12590 -1 (the default value) means replace all occurrences.
12591 /
12592
12593Return a copy with all occurrences of substring old replaced by new.
12594
12595If the optional argument count is given, only the first count occurrences are
12596replaced.
12597[clinic start generated code]*/
12598
12599static PyObject *
12600unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12601 Py_ssize_t count)
12602/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012604 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012606 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607}
12608
Alexander Belopolsky40018472011-02-26 01:02:56 +000012609static PyObject *
12610unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012612 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 Py_ssize_t isize;
12614 Py_ssize_t osize, squote, dquote, i, o;
12615 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012616 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012620 return NULL;
12621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 isize = PyUnicode_GET_LENGTH(unicode);
12623 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 /* Compute length of output, quote characters, and
12626 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012627 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 max = 127;
12629 squote = dquote = 0;
12630 ikind = PyUnicode_KIND(unicode);
12631 for (i = 0; i < isize; i++) {
12632 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012633 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012635 case '\'': squote++; break;
12636 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012638 incr = 2;
12639 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 default:
12641 /* Fast-path ASCII */
12642 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012643 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012645 ;
12646 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012649 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012651 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012653 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012655 if (osize > PY_SSIZE_T_MAX - incr) {
12656 PyErr_SetString(PyExc_OverflowError,
12657 "string is too long to generate repr");
12658 return NULL;
12659 }
12660 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 }
12662
12663 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012664 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012666 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 if (dquote)
12668 /* Both squote and dquote present. Use squote,
12669 and escape them */
12670 osize += squote;
12671 else
12672 quote = '"';
12673 }
Victor Stinner55c08782013-04-14 18:45:39 +020012674 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675
12676 repr = PyUnicode_New(osize, max);
12677 if (repr == NULL)
12678 return NULL;
12679 okind = PyUnicode_KIND(repr);
12680 odata = PyUnicode_DATA(repr);
12681
12682 PyUnicode_WRITE(okind, odata, 0, quote);
12683 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012684 if (unchanged) {
12685 _PyUnicode_FastCopyCharacters(repr, 1,
12686 unicode, 0,
12687 isize);
12688 }
12689 else {
12690 for (i = 0, o = 1; i < isize; i++) {
12691 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692
Victor Stinner55c08782013-04-14 18:45:39 +020012693 /* Escape quotes and backslashes */
12694 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012695 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012697 continue;
12698 }
12699
12700 /* Map special whitespace to '\t', \n', '\r' */
12701 if (ch == '\t') {
12702 PyUnicode_WRITE(okind, odata, o++, '\\');
12703 PyUnicode_WRITE(okind, odata, o++, 't');
12704 }
12705 else if (ch == '\n') {
12706 PyUnicode_WRITE(okind, odata, o++, '\\');
12707 PyUnicode_WRITE(okind, odata, o++, 'n');
12708 }
12709 else if (ch == '\r') {
12710 PyUnicode_WRITE(okind, odata, o++, '\\');
12711 PyUnicode_WRITE(okind, odata, o++, 'r');
12712 }
12713
12714 /* Map non-printable US ASCII to '\xhh' */
12715 else if (ch < ' ' || ch == 0x7F) {
12716 PyUnicode_WRITE(okind, odata, o++, '\\');
12717 PyUnicode_WRITE(okind, odata, o++, 'x');
12718 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12719 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12720 }
12721
12722 /* Copy ASCII characters as-is */
12723 else if (ch < 0x7F) {
12724 PyUnicode_WRITE(okind, odata, o++, ch);
12725 }
12726
12727 /* Non-ASCII characters */
12728 else {
12729 /* Map Unicode whitespace and control characters
12730 (categories Z* and C* except ASCII space)
12731 */
12732 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12733 PyUnicode_WRITE(okind, odata, o++, '\\');
12734 /* Map 8-bit characters to '\xhh' */
12735 if (ch <= 0xff) {
12736 PyUnicode_WRITE(okind, odata, o++, 'x');
12737 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12738 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12739 }
12740 /* Map 16-bit characters to '\uxxxx' */
12741 else if (ch <= 0xffff) {
12742 PyUnicode_WRITE(okind, odata, o++, 'u');
12743 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12744 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12745 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12746 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12747 }
12748 /* Map 21-bit characters to '\U00xxxxxx' */
12749 else {
12750 PyUnicode_WRITE(okind, odata, o++, 'U');
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12755 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12756 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12757 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12758 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12759 }
12760 }
12761 /* Copy characters as-is */
12762 else {
12763 PyUnicode_WRITE(okind, odata, o++, ch);
12764 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012765 }
12766 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012769 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012770 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771}
12772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012773PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775\n\
12776Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012777such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778arguments start and end are interpreted as in slice notation.\n\
12779\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012780Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781
12782static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012785 /* initialize variables to prevent gcc warning */
12786 PyObject *substring = NULL;
12787 Py_ssize_t start = 0;
12788 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012789 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012791 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012794 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012797 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 if (result == -2)
12800 return NULL;
12801
Christian Heimes217cfd12007-12-02 14:31:20 +000012802 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803}
12804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012805PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012806 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012808Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
12810static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012813 /* initialize variables to prevent gcc warning */
12814 PyObject *substring = NULL;
12815 Py_ssize_t start = 0;
12816 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012819 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012822 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012825 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 if (result == -2)
12828 return NULL;
12829
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830 if (result < 0) {
12831 PyErr_SetString(PyExc_ValueError, "substring not found");
12832 return NULL;
12833 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834
Christian Heimes217cfd12007-12-02 14:31:20 +000012835 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836}
12837
INADA Naoki3ae20562017-01-16 20:41:20 +090012838/*[clinic input]
12839str.rjust as unicode_rjust
12840
12841 width: Py_ssize_t
12842 fillchar: Py_UCS4 = ' '
12843 /
12844
12845Return a right-justified string of length width.
12846
12847Padding is done using the specified fill character (default is a space).
12848[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849
12850static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012851unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12852/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012854 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855 return NULL;
12856
Victor Stinnerc4b49542011-12-11 22:44:26 +010012857 if (PyUnicode_GET_LENGTH(self) >= width)
12858 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859
Victor Stinnerc4b49542011-12-11 22:44:26 +010012860 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861}
12862
Alexander Belopolsky40018472011-02-26 01:02:56 +000012863PyObject *
12864PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012866 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012869 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870}
12871
INADA Naoki3ae20562017-01-16 20:41:20 +090012872/*[clinic input]
12873str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874
INADA Naoki3ae20562017-01-16 20:41:20 +090012875 sep: object = None
12876 The delimiter according to which to split the string.
12877 None (the default value) means split according to any whitespace,
12878 and discard empty strings from the result.
12879 maxsplit: Py_ssize_t = -1
12880 Maximum number of splits to do.
12881 -1 (the default value) means no limit.
12882
12883Return a list of the words in the string, using sep as the delimiter string.
12884[clinic start generated code]*/
12885
12886static PyObject *
12887unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12888/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889{
INADA Naoki3ae20562017-01-16 20:41:20 +090012890 if (sep == Py_None)
12891 return split(self, NULL, maxsplit);
12892 if (PyUnicode_Check(sep))
12893 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012894
12895 PyErr_Format(PyExc_TypeError,
12896 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012897 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012898 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899}
12900
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012902PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012903{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012904 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012905 int kind1, kind2;
12906 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012908
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012909 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012910 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012911
Victor Stinner14f8f022011-10-05 20:58:25 +020012912 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 len1 = PyUnicode_GET_LENGTH(str_obj);
12915 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012916 if (kind1 < kind2 || len1 < len2) {
12917 _Py_INCREF_UNICODE_EMPTY();
12918 if (!unicode_empty)
12919 out = NULL;
12920 else {
12921 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12922 Py_DECREF(unicode_empty);
12923 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012924 return out;
12925 }
12926 buf1 = PyUnicode_DATA(str_obj);
12927 buf2 = PyUnicode_DATA(sep_obj);
12928 if (kind2 != kind1) {
12929 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12930 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012931 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012934 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012936 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12937 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12938 else
12939 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 break;
12941 case PyUnicode_2BYTE_KIND:
12942 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12943 break;
12944 case PyUnicode_4BYTE_KIND:
12945 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12946 break;
12947 default:
12948 assert(0);
12949 out = 0;
12950 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012951
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012952 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012954
12955 return out;
12956}
12957
12958
12959PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012960PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012961{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012962 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012963 int kind1, kind2;
12964 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012966
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012967 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012969
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012970 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 len1 = PyUnicode_GET_LENGTH(str_obj);
12973 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012974 if (kind1 < kind2 || len1 < len2) {
12975 _Py_INCREF_UNICODE_EMPTY();
12976 if (!unicode_empty)
12977 out = NULL;
12978 else {
12979 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12980 Py_DECREF(unicode_empty);
12981 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012982 return out;
12983 }
12984 buf1 = PyUnicode_DATA(str_obj);
12985 buf2 = PyUnicode_DATA(sep_obj);
12986 if (kind2 != kind1) {
12987 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12988 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012989 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012992 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012994 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12995 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12996 else
12997 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 break;
12999 case PyUnicode_2BYTE_KIND:
13000 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13001 break;
13002 case PyUnicode_4BYTE_KIND:
13003 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13004 break;
13005 default:
13006 assert(0);
13007 out = 0;
13008 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013009
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013010 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013012
13013 return out;
13014}
13015
INADA Naoki3ae20562017-01-16 20:41:20 +090013016/*[clinic input]
13017str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013018
INADA Naoki3ae20562017-01-16 20:41:20 +090013019 sep: object
13020 /
13021
13022Partition the string into three parts using the given separator.
13023
13024This will search for the separator in the string. If the separator is found,
13025returns a 3-tuple containing the part before the separator, the separator
13026itself, and the part after it.
13027
13028If the separator is not found, returns a 3-tuple containing the original string
13029and two empty strings.
13030[clinic start generated code]*/
13031
13032static PyObject *
13033unicode_partition(PyObject *self, PyObject *sep)
13034/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013035{
INADA Naoki3ae20562017-01-16 20:41:20 +090013036 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013037}
13038
INADA Naoki3ae20562017-01-16 20:41:20 +090013039/*[clinic input]
13040str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013041
INADA Naoki3ae20562017-01-16 20:41:20 +090013042Partition the string into three parts using the given separator.
13043
13044This will search for the separator in the string, starting at the end. If
13045the separator is found, returns a 3-tuple containing the part before the
13046separator, the separator itself, and the part after it.
13047
13048If the separator is not found, returns a 3-tuple containing two empty strings
13049and the original string.
13050[clinic start generated code]*/
13051
13052static PyObject *
13053unicode_rpartition(PyObject *self, PyObject *sep)
13054/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055{
INADA Naoki3ae20562017-01-16 20:41:20 +090013056 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013057}
13058
Alexander Belopolsky40018472011-02-26 01:02:56 +000013059PyObject *
13060PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013061{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013062 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013063 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013064
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013065 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013066}
13067
INADA Naoki3ae20562017-01-16 20:41:20 +090013068/*[clinic input]
13069str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013070
INADA Naoki3ae20562017-01-16 20:41:20 +090013071Return a list of the words in the string, using sep as the delimiter string.
13072
13073Splits are done starting at the end of the string and working to the front.
13074[clinic start generated code]*/
13075
13076static PyObject *
13077unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13078/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013079{
INADA Naoki3ae20562017-01-16 20:41:20 +090013080 if (sep == Py_None)
13081 return rsplit(self, NULL, maxsplit);
13082 if (PyUnicode_Check(sep))
13083 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013084
13085 PyErr_Format(PyExc_TypeError,
13086 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013087 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013088 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013089}
13090
INADA Naoki3ae20562017-01-16 20:41:20 +090013091/*[clinic input]
13092str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093
INADA Naoki3ae20562017-01-16 20:41:20 +090013094 keepends: int(c_default="0") = False
13095
13096Return a list of the lines in the string, breaking at line boundaries.
13097
13098Line breaks are not included in the resulting list unless keepends is given and
13099true.
13100[clinic start generated code]*/
13101
13102static PyObject *
13103unicode_splitlines_impl(PyObject *self, int keepends)
13104/*[clinic end generated code: output=f664dcdad153ec40 input=d6ff99fe43465b0f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013106 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107}
13108
13109static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013110PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013112 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113}
13114
INADA Naoki3ae20562017-01-16 20:41:20 +090013115/*[clinic input]
13116str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117
INADA Naoki3ae20562017-01-16 20:41:20 +090013118Convert uppercase characters to lowercase and lowercase characters to uppercase.
13119[clinic start generated code]*/
13120
13121static PyObject *
13122unicode_swapcase_impl(PyObject *self)
13123/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013125 if (PyUnicode_READY(self) == -1)
13126 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013127 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128}
13129
Larry Hastings61272b72014-01-07 12:41:53 -080013130/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013131
Larry Hastings31826802013-10-19 00:09:25 -070013132@staticmethod
13133str.maketrans as unicode_maketrans
13134
13135 x: object
13136
13137 y: unicode=NULL
13138
13139 z: unicode=NULL
13140
13141 /
13142
13143Return a translation table usable for str.translate().
13144
13145If there is only one argument, it must be a dictionary mapping Unicode
13146ordinals (integers) or characters to Unicode ordinals, strings or None.
13147Character keys will be then converted to ordinals.
13148If there are two arguments, they must be strings of equal length, and
13149in the resulting dictionary, each character in x will be mapped to the
13150character at the same position in y. If there is a third argument, it
13151must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013152[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013153
Larry Hastings31826802013-10-19 00:09:25 -070013154static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013155unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013156/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013157{
Georg Brandlceee0772007-11-27 23:48:05 +000013158 PyObject *new = NULL, *key, *value;
13159 Py_ssize_t i = 0;
13160 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013161
Georg Brandlceee0772007-11-27 23:48:05 +000013162 new = PyDict_New();
13163 if (!new)
13164 return NULL;
13165 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 int x_kind, y_kind, z_kind;
13167 void *x_data, *y_data, *z_data;
13168
Georg Brandlceee0772007-11-27 23:48:05 +000013169 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013170 if (!PyUnicode_Check(x)) {
13171 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13172 "be a string if there is a second argument");
13173 goto err;
13174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013176 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13177 "arguments must have equal length");
13178 goto err;
13179 }
13180 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013181 x_kind = PyUnicode_KIND(x);
13182 y_kind = PyUnicode_KIND(y);
13183 x_data = PyUnicode_DATA(x);
13184 y_data = PyUnicode_DATA(y);
13185 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13186 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013187 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013188 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013189 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013190 if (!value) {
13191 Py_DECREF(key);
13192 goto err;
13193 }
Georg Brandlceee0772007-11-27 23:48:05 +000013194 res = PyDict_SetItem(new, key, value);
13195 Py_DECREF(key);
13196 Py_DECREF(value);
13197 if (res < 0)
13198 goto err;
13199 }
13200 /* create entries for deleting chars in z */
13201 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013202 z_kind = PyUnicode_KIND(z);
13203 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013204 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013206 if (!key)
13207 goto err;
13208 res = PyDict_SetItem(new, key, Py_None);
13209 Py_DECREF(key);
13210 if (res < 0)
13211 goto err;
13212 }
13213 }
13214 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 int kind;
13216 void *data;
13217
Georg Brandlceee0772007-11-27 23:48:05 +000013218 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013219 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013220 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13221 "to maketrans it must be a dict");
13222 goto err;
13223 }
13224 /* copy entries into the new dict, converting string keys to int keys */
13225 while (PyDict_Next(x, &i, &key, &value)) {
13226 if (PyUnicode_Check(key)) {
13227 /* convert string keys to integer keys */
13228 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013229 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013230 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13231 "table must be of length 1");
13232 goto err;
13233 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 kind = PyUnicode_KIND(key);
13235 data = PyUnicode_DATA(key);
13236 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013237 if (!newkey)
13238 goto err;
13239 res = PyDict_SetItem(new, newkey, value);
13240 Py_DECREF(newkey);
13241 if (res < 0)
13242 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013243 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013244 /* just keep integer keys */
13245 if (PyDict_SetItem(new, key, value) < 0)
13246 goto err;
13247 } else {
13248 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13249 "be strings or integers");
13250 goto err;
13251 }
13252 }
13253 }
13254 return new;
13255 err:
13256 Py_DECREF(new);
13257 return NULL;
13258}
13259
INADA Naoki3ae20562017-01-16 20:41:20 +090013260/*[clinic input]
13261str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262
INADA Naoki3ae20562017-01-16 20:41:20 +090013263 table: object
13264 Translation table, which must be a mapping of Unicode ordinals to
13265 Unicode ordinals, strings, or None.
13266 /
13267
13268Replace each character in the string using the given translation table.
13269
13270The table must implement lookup/indexing via __getitem__, for instance a
13271dictionary or list. If this operation raises LookupError, the character is
13272left untouched. Characters mapped to None are deleted.
13273[clinic start generated code]*/
13274
13275static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013277/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280}
13281
INADA Naoki3ae20562017-01-16 20:41:20 +090013282/*[clinic input]
13283str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
INADA Naoki3ae20562017-01-16 20:41:20 +090013285Return a copy of the string converted to uppercase.
13286[clinic start generated code]*/
13287
13288static PyObject *
13289unicode_upper_impl(PyObject *self)
13290/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013292 if (PyUnicode_READY(self) == -1)
13293 return NULL;
13294 if (PyUnicode_IS_ASCII(self))
13295 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013296 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297}
13298
INADA Naoki3ae20562017-01-16 20:41:20 +090013299/*[clinic input]
13300str.zfill as unicode_zfill
13301
13302 width: Py_ssize_t
13303 /
13304
13305Pad a numeric string with zeros on the left, to fill a field of the given width.
13306
13307The string is never truncated.
13308[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309
13310static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013311unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013312/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013314 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013315 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 int kind;
13317 void *data;
13318 Py_UCS4 chr;
13319
Benjamin Petersonbac79492012-01-14 13:34:47 -050013320 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013322
Victor Stinnerc4b49542011-12-11 22:44:26 +010013323 if (PyUnicode_GET_LENGTH(self) >= width)
13324 return unicode_result_unchanged(self);
13325
13326 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327
13328 u = pad(self, fill, 0, '0');
13329
Walter Dörwald068325e2002-04-15 13:36:47 +000013330 if (u == NULL)
13331 return NULL;
13332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 kind = PyUnicode_KIND(u);
13334 data = PyUnicode_DATA(u);
13335 chr = PyUnicode_READ(kind, data, fill);
13336
13337 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 PyUnicode_WRITE(kind, data, 0, chr);
13340 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341 }
13342
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013343 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013344 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346
#if 0
/* Compiled out by the surrounding #if 0.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013355PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013358Return True if S starts with the specified prefix, False otherwise.\n\
13359With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013360With optional end, stop comparing S at that position.\n\
13361prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362
13363static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013364unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013365 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013367 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013368 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013369 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013370 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013371 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
Jesus Ceaac451502011-04-20 17:09:23 +020013373 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013374 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013375 if (PyTuple_Check(subobj)) {
13376 Py_ssize_t i;
13377 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013378 substring = PyTuple_GET_ITEM(subobj, i);
13379 if (!PyUnicode_Check(substring)) {
13380 PyErr_Format(PyExc_TypeError,
13381 "tuple for startswith must only contain str, "
13382 "not %.100s",
13383 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013384 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013385 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013386 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013387 if (result == -1)
13388 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013389 if (result) {
13390 Py_RETURN_TRUE;
13391 }
13392 }
13393 /* nothing matched */
13394 Py_RETURN_FALSE;
13395 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013396 if (!PyUnicode_Check(subobj)) {
13397 PyErr_Format(PyExc_TypeError,
13398 "startswith first arg must be str or "
13399 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013401 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013402 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013403 if (result == -1)
13404 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013405 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406}
13407
13408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013409PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013412Return True if S ends with the specified suffix, False otherwise.\n\
13413With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013414With optional end, stop comparing S at that position.\n\
13415suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416
13417static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013418unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013421 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013422 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013423 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013424 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013425 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426
Jesus Ceaac451502011-04-20 17:09:23 +020013427 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013429 if (PyTuple_Check(subobj)) {
13430 Py_ssize_t i;
13431 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013432 substring = PyTuple_GET_ITEM(subobj, i);
13433 if (!PyUnicode_Check(substring)) {
13434 PyErr_Format(PyExc_TypeError,
13435 "tuple for endswith must only contain str, "
13436 "not %.100s",
13437 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013439 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013440 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013441 if (result == -1)
13442 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013443 if (result) {
13444 Py_RETURN_TRUE;
13445 }
13446 }
13447 Py_RETURN_FALSE;
13448 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013449 if (!PyUnicode_Check(subobj)) {
13450 PyErr_Format(PyExc_TypeError,
13451 "endswith first arg must be str or "
13452 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013454 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013455 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013456 if (result == -1)
13457 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013458 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013459}
13460
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013461static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013462_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013463{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013464 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13465 writer->data = PyUnicode_DATA(writer->buffer);
13466
13467 if (!writer->readonly) {
13468 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013469 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013470 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013471 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013472 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13473 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13474 writer->kind = PyUnicode_WCHAR_KIND;
13475 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13476
Victor Stinner8f674cc2013-04-17 23:02:17 +020013477 /* Copy-on-write mode: set buffer size to 0 so
13478 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13479 * next write. */
13480 writer->size = 0;
13481 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013482}
13483
Victor Stinnerd3f08822012-05-29 12:57:52 +020013484void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013485_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013486{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013487 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013488
13489 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013490 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013491
13492 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13493 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13494 writer->kind = PyUnicode_WCHAR_KIND;
13495 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013496}
13497
Victor Stinnerd3f08822012-05-29 12:57:52 +020013498int
13499_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13500 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013501{
13502 Py_ssize_t newlen;
13503 PyObject *newbuffer;
13504
Victor Stinner2740e462016-09-06 16:58:36 -070013505 assert(maxchar <= MAX_UNICODE);
13506
Victor Stinnerca9381e2015-09-22 00:58:32 +020013507 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013508 assert((maxchar > writer->maxchar && length >= 0)
13509 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510
Victor Stinner202fdca2012-05-07 12:47:02 +020013511 if (length > PY_SSIZE_T_MAX - writer->pos) {
13512 PyErr_NoMemory();
13513 return -1;
13514 }
13515 newlen = writer->pos + length;
13516
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013517 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013518
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013520 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013521 if (writer->overallocate
13522 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13523 /* overallocate to limit the number of realloc() */
13524 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013525 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013526 if (newlen < writer->min_length)
13527 newlen = writer->min_length;
13528
Victor Stinnerd3f08822012-05-29 12:57:52 +020013529 writer->buffer = PyUnicode_New(newlen, maxchar);
13530 if (writer->buffer == NULL)
13531 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013532 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013533 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013534 if (writer->overallocate
13535 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13536 /* overallocate to limit the number of realloc() */
13537 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013538 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013539 if (newlen < writer->min_length)
13540 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013541
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013542 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013543 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013544 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013545 newbuffer = PyUnicode_New(newlen, maxchar);
13546 if (newbuffer == NULL)
13547 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013548 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13549 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013550 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013551 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013552 }
13553 else {
13554 newbuffer = resize_compact(writer->buffer, newlen);
13555 if (newbuffer == NULL)
13556 return -1;
13557 }
13558 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013559 }
13560 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013561 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013562 newbuffer = PyUnicode_New(writer->size, maxchar);
13563 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013564 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013565 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13566 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013567 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013568 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013569 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013570 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013571
13572#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013573}
13574
Victor Stinnerca9381e2015-09-22 00:58:32 +020013575int
13576_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13577 enum PyUnicode_Kind kind)
13578{
13579 Py_UCS4 maxchar;
13580
13581 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13582 assert(writer->kind < kind);
13583
13584 switch (kind)
13585 {
13586 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13587 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13588 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13589 default:
13590 assert(0 && "invalid kind");
13591 return -1;
13592 }
13593
13594 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13595}
13596
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013597static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013598_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013599{
Victor Stinner2740e462016-09-06 16:58:36 -070013600 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013601 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13602 return -1;
13603 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13604 writer->pos++;
13605 return 0;
13606}
13607
13608int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013609_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13610{
13611 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13612}
13613
13614int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013615_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13616{
13617 Py_UCS4 maxchar;
13618 Py_ssize_t len;
13619
13620 if (PyUnicode_READY(str) == -1)
13621 return -1;
13622 len = PyUnicode_GET_LENGTH(str);
13623 if (len == 0)
13624 return 0;
13625 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13626 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013627 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013628 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013629 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013630 Py_INCREF(str);
13631 writer->buffer = str;
13632 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013633 writer->pos += len;
13634 return 0;
13635 }
13636 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13637 return -1;
13638 }
13639 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13640 str, 0, len);
13641 writer->pos += len;
13642 return 0;
13643}
13644
Victor Stinnere215d962012-10-06 23:03:36 +020013645int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013646_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13647 Py_ssize_t start, Py_ssize_t end)
13648{
13649 Py_UCS4 maxchar;
13650 Py_ssize_t len;
13651
13652 if (PyUnicode_READY(str) == -1)
13653 return -1;
13654
13655 assert(0 <= start);
13656 assert(end <= PyUnicode_GET_LENGTH(str));
13657 assert(start <= end);
13658
13659 if (end == 0)
13660 return 0;
13661
13662 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13663 return _PyUnicodeWriter_WriteStr(writer, str);
13664
13665 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13666 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13667 else
13668 maxchar = writer->maxchar;
13669 len = end - start;
13670
13671 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13672 return -1;
13673
13674 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13675 str, start, len);
13676 writer->pos += len;
13677 return 0;
13678}
13679
13680int
Victor Stinner4a587072013-11-19 12:54:53 +010013681_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13682 const char *ascii, Py_ssize_t len)
13683{
13684 if (len == -1)
13685 len = strlen(ascii);
13686
13687 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13688
13689 if (writer->buffer == NULL && !writer->overallocate) {
13690 PyObject *str;
13691
13692 str = _PyUnicode_FromASCII(ascii, len);
13693 if (str == NULL)
13694 return -1;
13695
13696 writer->readonly = 1;
13697 writer->buffer = str;
13698 _PyUnicodeWriter_Update(writer);
13699 writer->pos += len;
13700 return 0;
13701 }
13702
13703 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13704 return -1;
13705
13706 switch (writer->kind)
13707 {
13708 case PyUnicode_1BYTE_KIND:
13709 {
13710 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13711 Py_UCS1 *data = writer->data;
13712
Christian Heimesf051e432016-09-13 20:22:02 +020013713 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013714 break;
13715 }
13716 case PyUnicode_2BYTE_KIND:
13717 {
13718 _PyUnicode_CONVERT_BYTES(
13719 Py_UCS1, Py_UCS2,
13720 ascii, ascii + len,
13721 (Py_UCS2 *)writer->data + writer->pos);
13722 break;
13723 }
13724 case PyUnicode_4BYTE_KIND:
13725 {
13726 _PyUnicode_CONVERT_BYTES(
13727 Py_UCS1, Py_UCS4,
13728 ascii, ascii + len,
13729 (Py_UCS4 *)writer->data + writer->pos);
13730 break;
13731 }
13732 default:
13733 assert(0);
13734 }
13735
13736 writer->pos += len;
13737 return 0;
13738}
13739
13740int
13741_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13742 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013743{
13744 Py_UCS4 maxchar;
13745
13746 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13747 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13748 return -1;
13749 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13750 writer->pos += len;
13751 return 0;
13752}
13753
Victor Stinnerd3f08822012-05-29 12:57:52 +020013754PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013755_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013756{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013757 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013758
Victor Stinnerd3f08822012-05-29 12:57:52 +020013759 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013760 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013761 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013762 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013763
13764 str = writer->buffer;
13765 writer->buffer = NULL;
13766
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013767 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013768 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13769 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013770 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013771
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013772 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13773 PyObject *str2;
13774 str2 = resize_compact(str, writer->pos);
13775 if (str2 == NULL) {
13776 Py_DECREF(str);
13777 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013778 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013779 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013780 }
13781
Victor Stinner15a0bd32013-07-08 22:29:55 +020013782 assert(_PyUnicode_CheckConsistency(str, 1));
13783 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013784}
13785
Victor Stinnerd3f08822012-05-29 12:57:52 +020013786void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013787_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013788{
13789 Py_CLEAR(writer->buffer);
13790}
13791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013792#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013793
/* Docstrings for the "format" and "format_map" entries of the
   unicode_methods table. */
13794PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013796\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013797Return a formatted version of S, using substitutions from args and kwargs.\n\
13798The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013799
Eric Smith27bbca62010-11-04 17:06:58 +000013800PyDoc_STRVAR(format_map__doc__,
13801 "S.format_map(mapping) -> str\n\
13802\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013803Return a formatted version of S, using substitutions from mapping.\n\
13804The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013805
INADA Naoki3ae20562017-01-16 20:41:20 +090013806/*[clinic input]
13807str.__format__ as unicode___format__
13808
13809 format_spec: unicode
13810 /
13811
13812Return a formatted version of the string as described by format_spec.
13813[clinic start generated code]*/
13814
Eric Smith4a7d76d2008-05-30 18:10:19 +000013815static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013816unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013817/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013818{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013819 _PyUnicodeWriter writer;
13820 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013821
Victor Stinnerd3f08822012-05-29 12:57:52 +020013822 if (PyUnicode_READY(self) == -1)
13823 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013824 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013825 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13826 self, format_spec, 0,
13827 PyUnicode_GET_LENGTH(format_spec));
13828 if (ret == -1) {
13829 _PyUnicodeWriter_Dealloc(&writer);
13830 return NULL;
13831 }
13832 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013833}
13834
INADA Naoki3ae20562017-01-16 20:41:20 +090013835/*[clinic input]
13836str.__sizeof__ as unicode_sizeof
13837
13838Return the size of the string in memory, in bytes.
13839[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013840
13841static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013842unicode_sizeof_impl(PyObject *self)
13843/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013845 Py_ssize_t size;
13846
13847 /* If it's a compact object, account for base structure +
13848 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013849 if (PyUnicode_IS_COMPACT_ASCII(self))
13850 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13851 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013852 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013853 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013854 else {
13855 /* If it is a two-block object, account for base object, and
13856 for character block if present. */
13857 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013858 if (_PyUnicode_DATA_ANY(self))
13859 size += (PyUnicode_GET_LENGTH(self) + 1) *
13860 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013861 }
13862 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013863 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013864 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13865 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13866 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13867 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013868
13869 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013870}
13871
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013872static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013873unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013874{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013875 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013876 if (!copy)
13877 return NULL;
13878 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013879}
13880
/* Method table for the str type.

   Entries are either UNICODE_*_METHODDEF macros (defined outside this
   section; presumably emitted by Argument Clinic) or hand-written
   {name, function, flags, doc} initializers.  The table ends with a
   {NULL, NULL} sentinel. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013881static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013882 UNICODE_ENCODE_METHODDEF
13883 UNICODE_REPLACE_METHODDEF
13884 UNICODE_SPLIT_METHODDEF
13885 UNICODE_RSPLIT_METHODDEF
13886 UNICODE_JOIN_METHODDEF
13887 UNICODE_CAPITALIZE_METHODDEF
13888 UNICODE_CASEFOLD_METHODDEF
13889 UNICODE_TITLE_METHODDEF
13890 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013891 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013892 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013893 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013894 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013895 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013896 UNICODE_LJUST_METHODDEF
13897 UNICODE_LOWER_METHODDEF
13898 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013899 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13900 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013901 UNICODE_RJUST_METHODDEF
13902 UNICODE_RSTRIP_METHODDEF
13903 UNICODE_RPARTITION_METHODDEF
13904 UNICODE_SPLITLINES_METHODDEF
13905 UNICODE_STRIP_METHODDEF
13906 UNICODE_SWAPCASE_METHODDEF
13907 UNICODE_TRANSLATE_METHODDEF
13908 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013909 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13910 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013911 UNICODE_ISLOWER_METHODDEF
13912 UNICODE_ISUPPER_METHODDEF
13913 UNICODE_ISTITLE_METHODDEF
13914 UNICODE_ISSPACE_METHODDEF
13915 UNICODE_ISDECIMAL_METHODDEF
13916 UNICODE_ISDIGIT_METHODDEF
13917 UNICODE_ISNUMERIC_METHODDEF
13918 UNICODE_ISALPHA_METHODDEF
13919 UNICODE_ISALNUM_METHODDEF
13920 UNICODE_ISIDENTIFIER_METHODDEF
13921 UNICODE_ISPRINTABLE_METHODDEF
13922 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013923 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013924 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013925 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013926 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013927 UNICODE_SIZEOF_METHODDEF
/* The entry inside "#if 0" is compiled out. */
Walter Dörwald068325e2002-04-15 13:36:47 +000013928#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013929 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013930 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013931#endif
13932
Benjamin Peterson14339b62009-01-31 16:36:08 +000013933 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
/* Sentinel marking the end of the table. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013934 {NULL, NULL}
13935};
13936
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013937static PyObject *
13938unicode_mod(PyObject *v, PyObject *w)
13939{
Brian Curtindfc80e32011-08-10 20:28:54 -050013940 if (!PyUnicode_Check(v))
13941 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013942 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013943}
13944
/* Number protocol slots for str.  Only nb_remainder is filled in (with
   unicode_mod, which performs "%" formatting); the slots listed before it
   are 0 and the ones after it are left to default initialization. */
13945static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013946 0, /*nb_add*/
13947 0, /*nb_subtract*/
13948 0, /*nb_multiply*/
13949 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013950};
13951
/* Sequence protocol slots for str: length, concatenation, repetition,
   item access and containment are provided; the slice and assignment
   slots are 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013952static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013953 (lenfunc) unicode_length, /* sq_length */
13954 PyUnicode_Concat, /* sq_concat */
13955 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13956 (ssizeargfunc) unicode_getitem, /* sq_item */
13957 0, /* sq_slice */
13958 0, /* sq_ass_item */
13959 0, /* sq_ass_slice */
13960 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013961};
13962
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013963static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013964unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013966 if (PyUnicode_READY(self) == -1)
13967 return NULL;
13968
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013969 if (PyIndex_Check(item)) {
13970 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013971 if (i == -1 && PyErr_Occurred())
13972 return NULL;
13973 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013974 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013975 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013976 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013977 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013978 PyObject *result;
13979 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013980 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013981 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013983 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013984 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013985 return NULL;
13986 }
13987
13988 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013989 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013990 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013991 slicelength == PyUnicode_GET_LENGTH(self)) {
13992 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013993 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013994 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013995 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013996 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013997 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013998 src_kind = PyUnicode_KIND(self);
13999 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014000 if (!PyUnicode_IS_ASCII(self)) {
14001 kind_limit = kind_maxchar_limit(src_kind);
14002 max_char = 0;
14003 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14004 ch = PyUnicode_READ(src_kind, src_data, cur);
14005 if (ch > max_char) {
14006 max_char = ch;
14007 if (max_char >= kind_limit)
14008 break;
14009 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014010 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014011 }
Victor Stinner55c99112011-10-13 01:17:06 +020014012 else
14013 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014014 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014015 if (result == NULL)
14016 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014017 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014018 dest_data = PyUnicode_DATA(result);
14019
14020 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014021 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14022 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014023 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014024 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014025 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014026 } else {
14027 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14028 return NULL;
14029 }
14030}
14031
/* Mapping protocol slots for str: length and subscripting (indexes and
   slices, via unicode_subscript); item assignment is not provided. */
14032static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014033 (lenfunc)unicode_length, /* mp_length */
14034 (binaryfunc)unicode_subscript, /* mp_subscript */
14035 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014036};
14037
Guido van Rossumd57fd912000-03-10 22:53:23 +000014038
Guido van Rossumd57fd912000-03-10 22:53:23 +000014039/* Helpers for PyUnicode_Format() */
14040
/* State shared by the helpers that implement "%" formatting. */
Victor Stinnera47082312012-10-04 02:19:54 +020014041struct unicode_formatter_t {
/* Values to format: a tuple indexed by argidx, or, when arglen < 0, a
   single object that unicode_format_getnextarg() returns as is. */
14042 PyObject *args;
/* Non-zero when args must be released (Py_DECREF) before being replaced. */
14043 int args_owned;
/* arglen: number of available arguments; argidx: index of the next one. */
14044 Py_ssize_t arglen, argidx;
/* Mapping used for "%(name)" lookups; NULL when no mapping was given. */
14045 PyObject *dict;
14046
/* fmtkind/fmtdata/fmtpos are what FORMAT_READ() uses to read the current
   character of the format string; fmtcnt is decremented as characters are
   consumed (presumably the number of characters left). */
14047 enum PyUnicode_Kind fmtkind;
14048 Py_ssize_t fmtcnt, fmtpos;
14049 void *fmtdata;
/* The format string object; "%(name)" keys are sliced out of it. */
14050 PyObject *fmtstr;
14051
/* Writer that accumulates the formatted output. */
14052 _PyUnicodeWriter writer;
14053};
14054
/* Parsed description of a single "%" conversion. */
14055struct unicode_format_arg_t {
/* Character currently being processed; once parsing is done it is used as
   the conversion type (e.g. by mainformatlong() and formatfloat()). */
14056 Py_UCS4 ch;
/* Bit mask of F_* flags (F_ALT, F_SIGN, F_BLANK are tested in this file). */
14057 int flags;
/* Field width; mainformatlong() treats -1 as "not specified". */
14058 Py_ssize_t width;
/* Precision; -1 is treated as "not specified" (formatfloat() then uses 6). */
14059 int prec;
/* NOTE(review): not read in the code visible here -- presumably set by the
   caller when an explicit sign has to be emitted; confirm at the use site. */
14060 int sign;
14061};
14062
Guido van Rossumd57fd912000-03-10 22:53:23 +000014063static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014064unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014065{
Victor Stinnera47082312012-10-04 02:19:54 +020014066 Py_ssize_t argidx = ctx->argidx;
14067
14068 if (argidx < ctx->arglen) {
14069 ctx->argidx++;
14070 if (ctx->arglen < 0)
14071 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014072 else
Victor Stinnera47082312012-10-04 02:19:54 +020014073 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014074 }
14075 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014076 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014077 return NULL;
14078}
14079
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014080/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014081
Victor Stinnera47082312012-10-04 02:19:54 +020014082/* Format a float into the writer if the writer is not NULL, or into *p_output
14083 otherwise.
14084
14085 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014086static int
Victor Stinnera47082312012-10-04 02:19:54 +020014087formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14088 PyObject **p_output,
14089 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014090{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014091 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014092 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014093 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014094 int prec;
14095 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014096
Guido van Rossumd57fd912000-03-10 22:53:23 +000014097 x = PyFloat_AsDouble(v);
14098 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014099 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014100
Victor Stinnera47082312012-10-04 02:19:54 +020014101 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014102 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014103 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014104
Victor Stinnera47082312012-10-04 02:19:54 +020014105 if (arg->flags & F_ALT)
14106 dtoa_flags = Py_DTSF_ALT;
14107 else
14108 dtoa_flags = 0;
14109 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014110 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014111 return -1;
14112 len = strlen(p);
14113 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014114 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014115 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014116 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014117 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014118 }
14119 else
14120 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014121 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014122 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014123}
14124
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the F_ALT flag, for Python's long (unbounded) ints.  It's not used
 * for Python's regular ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The "0o"/"0x"/"0X" prefix is present only for o, x and X
 *         conversions, and only when alt is non-zero.  The case of hex
 *         digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted
 * alt          non-zero to keep the base prefix (the F_ALT format flag)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
/* Parameters: val must be an int (asserted below); alt non-zero keeps the
   "0o"/"0x"/"0X" base prefix in the output; prec is the minimum number of
   digits; type is the conversion character.
   Returns a new str object, or NULL with an exception set. */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014143PyObject *
14144_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014145{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014146 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014147 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014148 Py_ssize_t i;
14149 int sign; /* 1 if '-', else 0 */
14150 int len; /* number of characters */
14151 Py_ssize_t llen;
14152 int numdigits; /* len == numnondigits + numdigits */
14153 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014154
Victor Stinnerd0880d52012-04-27 23:40:13 +020014155 /* Avoid exceeding SSIZE_T_MAX */
14156 if (prec > INT_MAX-3) {
14157 PyErr_SetString(PyExc_OverflowError,
14158 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014160 }
14161
14162 assert(PyLong_Check(val));
14163
/* Convert to text in the requested base.  For bases 8 and 16,
   numnondigits starts at 2 to account for the two-character base prefix
   that the asserts further down expect in the converted text.  An
   unexpected type trips the assert in debug builds and otherwise falls
   through to the decimal case. */
14164 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014165 default:
14166 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014167 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014168 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014169 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014170 /* int and int subclasses should print numerically when a numeric */
14171 /* format code is used (see issue18780) */
14172 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014173 break;
14174 case 'o':
14175 numnondigits = 2;
14176 result = PyNumber_ToBase(val, 8);
14177 break;
14178 case 'x':
14179 case 'X':
14180 numnondigits = 2;
14181 result = PyNumber_ToBase(val, 16);
14182 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014183 }
14184 if (!result)
14185 return NULL;
14186
14187 assert(unicode_modifiable(result));
14188 assert(PyUnicode_IS_READY(result));
14189 assert(PyUnicode_IS_ASCII(result));
14190
14191 /* To modify the string in-place, there can only be one reference. */
14192 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014193 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014194 PyErr_BadInternalCall();
14195 return NULL;
14196 }
14197 buf = PyUnicode_DATA(result);
14198 llen = PyUnicode_GET_LENGTH(result);
/* len is an int, so refuse anything that does not fit. */
14199 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014200 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014201 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014202 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014203 return NULL;
14204 }
14205 len = (int)llen;
14206 sign = buf[0] == '-';
14207 numnondigits += sign;
14208 numdigits = len - numnondigits;
14209 assert(numdigits > 0);
14210
14211 /* Get rid of base marker unless F_ALT */
14212 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014213 (type == 'o' || type == 'x' || type == 'X'))) {
14214 assert(buf[sign] == '0');
14215 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14216 buf[sign+1] == 'o');
14217 numnondigits -= 2;
/* Skip the prefix by advancing buf by two characters.  For a negative
   number the '-' is written again at the new start, i.e. directly in
   front of the first digit. */
14218 buf += 2;
14219 len -= 2;
14220 if (sign)
14221 buf[0] = '-';
14222 assert(len == numnondigits + numdigits);
14223 assert(numdigits > 0);
14224 }
14225
14226 /* Fill with leading zeroes to meet minimum width. */
14227 if (prec > numdigits) {
/* The padded text is assembled in a temporary bytes object: first the
   non-digit characters (sign and/or prefix), then the zero padding, then
   the digits.  From here on result refers to that bytes object. */
14228 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14229 numnondigits + prec);
14230 char *b1;
14231 if (!r1) {
14232 Py_DECREF(result);
14233 return NULL;
14234 }
14235 b1 = PyBytes_AS_STRING(r1);
14236 for (i = 0; i < numnondigits; ++i)
14237 *b1++ = *buf++;
14238 for (i = 0; i < prec - numdigits; i++)
14239 *b1++ = '0';
14240 for (i = 0; i < numdigits; i++)
14241 *b1++ = *buf++;
14242 *b1 = '\0';
14243 Py_DECREF(result);
14244 result = r1;
14245 buf = PyBytes_AS_STRING(result);
14246 len = numnondigits + prec;
14247 }
14248
14249 /* Fix up case for hex conversions. */
14250 if (type == 'X') {
14251 /* Need to convert all lower case letters to upper case.
14252 and need to convert 0x to 0X (and -0x to -0X). */
/* The range goes up to 'x' (not 'f') so that the 'x' of the prefix is
   converted as well. */
14253 for (i = 0; i < len; i++)
14254 if (buf[i] >= 'a' && buf[i] <= 'x')
14255 buf[i] -= 'a'-'A';
14256 }
/* Build a fresh str from buf when result is the temporary bytes object or
   when buf no longer points at the start of result's data (prefix
   skipped); otherwise only adjust the length of result if needed. */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014257 if (!PyUnicode_Check(result)
14258 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014259 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014260 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014261 Py_DECREF(result);
14262 result = unicode;
14263 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014264 else if (len != PyUnicode_GET_LENGTH(result)) {
14265 if (PyUnicode_Resize(&result, len) < 0)
14266 Py_CLEAR(result);
14267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014269}
14270
Ethan Furmandf3ed242014-01-05 06:50:30 -080014271/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014272 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014273 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014274 * -1 and raise an exception on error */
14275static int
Victor Stinnera47082312012-10-04 02:19:54 +020014276mainformatlong(PyObject *v,
14277 struct unicode_format_arg_t *arg,
14278 PyObject **p_output,
14279 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014280{
14281 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014282 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014283
14284 if (!PyNumber_Check(v))
14285 goto wrongtype;
14286
Ethan Furman9ab74802014-03-21 06:38:46 -070014287 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014288 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014289 if (type == 'o' || type == 'x' || type == 'X') {
14290 iobj = PyNumber_Index(v);
14291 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014292 if (PyErr_ExceptionMatches(PyExc_TypeError))
14293 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014294 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014295 }
14296 }
14297 else {
14298 iobj = PyNumber_Long(v);
14299 if (iobj == NULL ) {
14300 if (PyErr_ExceptionMatches(PyExc_TypeError))
14301 goto wrongtype;
14302 return -1;
14303 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014304 }
14305 assert(PyLong_Check(iobj));
14306 }
14307 else {
14308 iobj = v;
14309 Py_INCREF(iobj);
14310 }
14311
14312 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014313 && arg->width == -1 && arg->prec == -1
14314 && !(arg->flags & (F_SIGN | F_BLANK))
14315 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014316 {
14317 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014318 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014319 int base;
14320
Victor Stinnera47082312012-10-04 02:19:54 +020014321 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014322 {
14323 default:
14324 assert(0 && "'type' not in [diuoxX]");
14325 case 'd':
14326 case 'i':
14327 case 'u':
14328 base = 10;
14329 break;
14330 case 'o':
14331 base = 8;
14332 break;
14333 case 'x':
14334 case 'X':
14335 base = 16;
14336 break;
14337 }
14338
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014339 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14340 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014341 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014342 }
14343 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014344 return 1;
14345 }
14346
Ethan Furmanb95b5612015-01-23 20:05:18 -080014347 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014348 Py_DECREF(iobj);
14349 if (res == NULL)
14350 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014351 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014352 return 0;
14353
14354wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014355 switch(type)
14356 {
14357 case 'o':
14358 case 'x':
14359 case 'X':
14360 PyErr_Format(PyExc_TypeError,
14361 "%%%c format: an integer is required, "
14362 "not %.200s",
14363 type, Py_TYPE(v)->tp_name);
14364 break;
14365 default:
14366 PyErr_Format(PyExc_TypeError,
14367 "%%%c format: a number is required, "
14368 "not %.200s",
14369 type, Py_TYPE(v)->tp_name);
14370 break;
14371 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014372 return -1;
14373}
14374
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014375static Py_UCS4
14376formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014377{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014378 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014379 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014380 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014381 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014382 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014383 goto onError;
14384 }
14385 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014386 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014387 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014388 /* make sure number is a type of integer */
14389 if (!PyLong_Check(v)) {
14390 iobj = PyNumber_Index(v);
14391 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014392 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014393 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014394 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014395 Py_DECREF(iobj);
14396 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014397 else {
14398 x = PyLong_AsLong(v);
14399 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014400 if (x == -1 && PyErr_Occurred())
14401 goto onError;
14402
Victor Stinner8faf8212011-12-08 22:14:11 +010014403 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014404 PyErr_SetString(PyExc_OverflowError,
14405 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014406 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014407 }
14408
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014409 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014410 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014411
Benjamin Peterson29060642009-01-31 22:14:21 +000014412 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014413 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014414 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014415 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014416}
14417
Victor Stinnera47082312012-10-04 02:19:54 +020014418/* Parse options of an argument: flags, width, precision.
14419 Handle also "%(name)" syntax.
14420
14421 Return 0 if the argument has been formatted into arg->str.
14422 Return 1 if the argument has been written into ctx->writer,
14423 Raise an exception and return -1 on error. */
14424static int
14425unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14426 struct unicode_format_arg_t *arg)
14427{
14428#define FORMAT_READ(ctx) \
14429 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14430
14431 PyObject *v;
14432
Victor Stinnera47082312012-10-04 02:19:54 +020014433 if (arg->ch == '(') {
14434 /* Get argument value from a dictionary. Example: "%(name)s". */
14435 Py_ssize_t keystart;
14436 Py_ssize_t keylen;
14437 PyObject *key;
14438 int pcount = 1;
14439
14440 if (ctx->dict == NULL) {
14441 PyErr_SetString(PyExc_TypeError,
14442 "format requires a mapping");
14443 return -1;
14444 }
14445 ++ctx->fmtpos;
14446 --ctx->fmtcnt;
14447 keystart = ctx->fmtpos;
14448 /* Skip over balanced parentheses */
14449 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14450 arg->ch = FORMAT_READ(ctx);
14451 if (arg->ch == ')')
14452 --pcount;
14453 else if (arg->ch == '(')
14454 ++pcount;
14455 ctx->fmtpos++;
14456 }
14457 keylen = ctx->fmtpos - keystart - 1;
14458 if (ctx->fmtcnt < 0 || pcount > 0) {
14459 PyErr_SetString(PyExc_ValueError,
14460 "incomplete format key");
14461 return -1;
14462 }
14463 key = PyUnicode_Substring(ctx->fmtstr,
14464 keystart, keystart + keylen);
14465 if (key == NULL)
14466 return -1;
14467 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014468 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014469 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014470 }
14471 ctx->args = PyObject_GetItem(ctx->dict, key);
14472 Py_DECREF(key);
14473 if (ctx->args == NULL)
14474 return -1;
14475 ctx->args_owned = 1;
14476 ctx->arglen = -1;
14477 ctx->argidx = -2;
14478 }
14479
14480 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014481 while (--ctx->fmtcnt >= 0) {
14482 arg->ch = FORMAT_READ(ctx);
14483 ctx->fmtpos++;
14484 switch (arg->ch) {
14485 case '-': arg->flags |= F_LJUST; continue;
14486 case '+': arg->flags |= F_SIGN; continue;
14487 case ' ': arg->flags |= F_BLANK; continue;
14488 case '#': arg->flags |= F_ALT; continue;
14489 case '0': arg->flags |= F_ZERO; continue;
14490 }
14491 break;
14492 }
14493
14494 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014495 if (arg->ch == '*') {
14496 v = unicode_format_getnextarg(ctx);
14497 if (v == NULL)
14498 return -1;
14499 if (!PyLong_Check(v)) {
14500 PyErr_SetString(PyExc_TypeError,
14501 "* wants int");
14502 return -1;
14503 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014504 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014505 if (arg->width == -1 && PyErr_Occurred())
14506 return -1;
14507 if (arg->width < 0) {
14508 arg->flags |= F_LJUST;
14509 arg->width = -arg->width;
14510 }
14511 if (--ctx->fmtcnt >= 0) {
14512 arg->ch = FORMAT_READ(ctx);
14513 ctx->fmtpos++;
14514 }
14515 }
14516 else if (arg->ch >= '0' && arg->ch <= '9') {
14517 arg->width = arg->ch - '0';
14518 while (--ctx->fmtcnt >= 0) {
14519 arg->ch = FORMAT_READ(ctx);
14520 ctx->fmtpos++;
14521 if (arg->ch < '0' || arg->ch > '9')
14522 break;
14523 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14524 mixing signed and unsigned comparison. Since arg->ch is between
14525 '0' and '9', casting to int is safe. */
14526 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14527 PyErr_SetString(PyExc_ValueError,
14528 "width too big");
14529 return -1;
14530 }
14531 arg->width = arg->width*10 + (arg->ch - '0');
14532 }
14533 }
14534
14535 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014536 if (arg->ch == '.') {
14537 arg->prec = 0;
14538 if (--ctx->fmtcnt >= 0) {
14539 arg->ch = FORMAT_READ(ctx);
14540 ctx->fmtpos++;
14541 }
14542 if (arg->ch == '*') {
14543 v = unicode_format_getnextarg(ctx);
14544 if (v == NULL)
14545 return -1;
14546 if (!PyLong_Check(v)) {
14547 PyErr_SetString(PyExc_TypeError,
14548 "* wants int");
14549 return -1;
14550 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014551 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014552 if (arg->prec == -1 && PyErr_Occurred())
14553 return -1;
14554 if (arg->prec < 0)
14555 arg->prec = 0;
14556 if (--ctx->fmtcnt >= 0) {
14557 arg->ch = FORMAT_READ(ctx);
14558 ctx->fmtpos++;
14559 }
14560 }
14561 else if (arg->ch >= '0' && arg->ch <= '9') {
14562 arg->prec = arg->ch - '0';
14563 while (--ctx->fmtcnt >= 0) {
14564 arg->ch = FORMAT_READ(ctx);
14565 ctx->fmtpos++;
14566 if (arg->ch < '0' || arg->ch > '9')
14567 break;
14568 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14569 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014570 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014571 return -1;
14572 }
14573 arg->prec = arg->prec*10 + (arg->ch - '0');
14574 }
14575 }
14576 }
14577
14578 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14579 if (ctx->fmtcnt >= 0) {
14580 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14581 if (--ctx->fmtcnt >= 0) {
14582 arg->ch = FORMAT_READ(ctx);
14583 ctx->fmtpos++;
14584 }
14585 }
14586 }
14587 if (ctx->fmtcnt < 0) {
14588 PyErr_SetString(PyExc_ValueError,
14589 "incomplete format");
14590 return -1;
14591 }
14592 return 0;
14593
14594#undef FORMAT_READ
14595}
14596
14597/* Format one argument. Supported conversion specifiers:
14598
14599 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014600 - "i", "d", "u": int or float
14601 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014602 - "e", "E", "f", "F", "g", "G": float
14603 - "c": int or str (1 character)
14604
Victor Stinner8dbd4212012-12-04 09:30:24 +010014605 When possible, the output is written directly into the Unicode writer
14606 (ctx->writer). A string is created when padding is required.
14607
Victor Stinnera47082312012-10-04 02:19:54 +020014608 Return 0 if the argument has been formatted into *p_str,
14609 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014610 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014611static int
14612unicode_format_arg_format(struct unicode_formatter_t *ctx,
14613 struct unicode_format_arg_t *arg,
14614 PyObject **p_str)
14615{
14616 PyObject *v;
14617 _PyUnicodeWriter *writer = &ctx->writer;
14618
14619 if (ctx->fmtcnt == 0)
14620 ctx->writer.overallocate = 0;
14621
14622 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014623 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014624 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014625 return 1;
14626 }
14627
14628 v = unicode_format_getnextarg(ctx);
14629 if (v == NULL)
14630 return -1;
14631
Victor Stinnera47082312012-10-04 02:19:54 +020014632
14633 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014634 case 's':
14635 case 'r':
14636 case 'a':
14637 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14638 /* Fast path */
14639 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14640 return -1;
14641 return 1;
14642 }
14643
14644 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14645 *p_str = v;
14646 Py_INCREF(*p_str);
14647 }
14648 else {
14649 if (arg->ch == 's')
14650 *p_str = PyObject_Str(v);
14651 else if (arg->ch == 'r')
14652 *p_str = PyObject_Repr(v);
14653 else
14654 *p_str = PyObject_ASCII(v);
14655 }
14656 break;
14657
14658 case 'i':
14659 case 'd':
14660 case 'u':
14661 case 'o':
14662 case 'x':
14663 case 'X':
14664 {
14665 int ret = mainformatlong(v, arg, p_str, writer);
14666 if (ret != 0)
14667 return ret;
14668 arg->sign = 1;
14669 break;
14670 }
14671
14672 case 'e':
14673 case 'E':
14674 case 'f':
14675 case 'F':
14676 case 'g':
14677 case 'G':
14678 if (arg->width == -1 && arg->prec == -1
14679 && !(arg->flags & (F_SIGN | F_BLANK)))
14680 {
14681 /* Fast path */
14682 if (formatfloat(v, arg, NULL, writer) == -1)
14683 return -1;
14684 return 1;
14685 }
14686
14687 arg->sign = 1;
14688 if (formatfloat(v, arg, p_str, NULL) == -1)
14689 return -1;
14690 break;
14691
14692 case 'c':
14693 {
14694 Py_UCS4 ch = formatchar(v);
14695 if (ch == (Py_UCS4) -1)
14696 return -1;
14697 if (arg->width == -1 && arg->prec == -1) {
14698 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014699 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014700 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014701 return 1;
14702 }
14703 *p_str = PyUnicode_FromOrdinal(ch);
14704 break;
14705 }
14706
14707 default:
14708 PyErr_Format(PyExc_ValueError,
14709 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014710 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014711 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14712 (int)arg->ch,
14713 ctx->fmtpos - 1);
14714 return -1;
14715 }
14716 if (*p_str == NULL)
14717 return -1;
14718 assert (PyUnicode_Check(*p_str));
14719 return 0;
14720}
14721
14722static int
14723unicode_format_arg_output(struct unicode_formatter_t *ctx,
14724 struct unicode_format_arg_t *arg,
14725 PyObject *str)
14726{
14727 Py_ssize_t len;
14728 enum PyUnicode_Kind kind;
14729 void *pbuf;
14730 Py_ssize_t pindex;
14731 Py_UCS4 signchar;
14732 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014733 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014734 Py_ssize_t sublen;
14735 _PyUnicodeWriter *writer = &ctx->writer;
14736 Py_UCS4 fill;
14737
14738 fill = ' ';
14739 if (arg->sign && arg->flags & F_ZERO)
14740 fill = '0';
14741
14742 if (PyUnicode_READY(str) == -1)
14743 return -1;
14744
14745 len = PyUnicode_GET_LENGTH(str);
14746 if ((arg->width == -1 || arg->width <= len)
14747 && (arg->prec == -1 || arg->prec >= len)
14748 && !(arg->flags & (F_SIGN | F_BLANK)))
14749 {
14750 /* Fast path */
14751 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14752 return -1;
14753 return 0;
14754 }
14755
14756 /* Truncate the string for "s", "r" and "a" formats
14757 if the precision is set */
14758 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14759 if (arg->prec >= 0 && len > arg->prec)
14760 len = arg->prec;
14761 }
14762
14763 /* Adjust sign and width */
14764 kind = PyUnicode_KIND(str);
14765 pbuf = PyUnicode_DATA(str);
14766 pindex = 0;
14767 signchar = '\0';
14768 if (arg->sign) {
14769 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14770 if (ch == '-' || ch == '+') {
14771 signchar = ch;
14772 len--;
14773 pindex++;
14774 }
14775 else if (arg->flags & F_SIGN)
14776 signchar = '+';
14777 else if (arg->flags & F_BLANK)
14778 signchar = ' ';
14779 else
14780 arg->sign = 0;
14781 }
14782 if (arg->width < len)
14783 arg->width = len;
14784
14785 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014786 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014787 if (!(arg->flags & F_LJUST)) {
14788 if (arg->sign) {
14789 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014790 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014791 }
14792 else {
14793 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014794 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014795 }
14796 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014797 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14798 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014799 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014800 }
14801
Victor Stinnera47082312012-10-04 02:19:54 +020014802 buflen = arg->width;
14803 if (arg->sign && len == arg->width)
14804 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014805 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014806 return -1;
14807
14808 /* Write the sign if needed */
14809 if (arg->sign) {
14810 if (fill != ' ') {
14811 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14812 writer->pos += 1;
14813 }
14814 if (arg->width > len)
14815 arg->width--;
14816 }
14817
14818 /* Write the numeric prefix for "x", "X" and "o" formats
14819 if the alternate form is used.
14820 For example, write "0x" for the "%#x" format. */
14821 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14822 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14823 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14824 if (fill != ' ') {
14825 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14826 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14827 writer->pos += 2;
14828 pindex += 2;
14829 }
14830 arg->width -= 2;
14831 if (arg->width < 0)
14832 arg->width = 0;
14833 len -= 2;
14834 }
14835
14836 /* Pad left with the fill character if needed */
14837 if (arg->width > len && !(arg->flags & F_LJUST)) {
14838 sublen = arg->width - len;
14839 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14840 writer->pos += sublen;
14841 arg->width = len;
14842 }
14843
14844 /* If padding with spaces: write sign if needed and/or numeric prefix if
14845 the alternate form is used */
14846 if (fill == ' ') {
14847 if (arg->sign) {
14848 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14849 writer->pos += 1;
14850 }
14851 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14852 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14853 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14854 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14855 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14856 writer->pos += 2;
14857 pindex += 2;
14858 }
14859 }
14860
14861 /* Write characters */
14862 if (len) {
14863 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14864 str, pindex, len);
14865 writer->pos += len;
14866 }
14867
14868 /* Pad right with the fill character if needed */
14869 if (arg->width > len) {
14870 sublen = arg->width - len;
14871 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14872 writer->pos += sublen;
14873 }
14874 return 0;
14875}
14876
14877/* Helper of PyUnicode_Format(): format one arg.
14878 Return 0 on success, raise an exception and return -1 on error. */
14879static int
14880unicode_format_arg(struct unicode_formatter_t *ctx)
14881{
14882 struct unicode_format_arg_t arg;
14883 PyObject *str;
14884 int ret;
14885
Victor Stinner8dbd4212012-12-04 09:30:24 +010014886 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14887 arg.flags = 0;
14888 arg.width = -1;
14889 arg.prec = -1;
14890 arg.sign = 0;
14891 str = NULL;
14892
Victor Stinnera47082312012-10-04 02:19:54 +020014893 ret = unicode_format_arg_parse(ctx, &arg);
14894 if (ret == -1)
14895 return -1;
14896
14897 ret = unicode_format_arg_format(ctx, &arg, &str);
14898 if (ret == -1)
14899 return -1;
14900
14901 if (ret != 1) {
14902 ret = unicode_format_arg_output(ctx, &arg, str);
14903 Py_DECREF(str);
14904 if (ret == -1)
14905 return -1;
14906 }
14907
14908 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14909 PyErr_SetString(PyExc_TypeError,
14910 "not all arguments converted during string formatting");
14911 return -1;
14912 }
14913 return 0;
14914}
14915
Alexander Belopolsky40018472011-02-26 01:02:56 +000014916PyObject *
14917PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014918{
Victor Stinnera47082312012-10-04 02:19:54 +020014919 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014920
Guido van Rossumd57fd912000-03-10 22:53:23 +000014921 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014922 PyErr_BadInternalCall();
14923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014924 }
Victor Stinnera47082312012-10-04 02:19:54 +020014925
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014926 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014927 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014928
14929 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014930 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14931 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14932 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14933 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014934
Victor Stinner8f674cc2013-04-17 23:02:17 +020014935 _PyUnicodeWriter_Init(&ctx.writer);
14936 ctx.writer.min_length = ctx.fmtcnt + 100;
14937 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014938
Guido van Rossumd57fd912000-03-10 22:53:23 +000014939 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014940 ctx.arglen = PyTuple_Size(args);
14941 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014942 }
14943 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014944 ctx.arglen = -1;
14945 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946 }
Victor Stinnera47082312012-10-04 02:19:54 +020014947 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014948 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014949 ctx.dict = args;
14950 else
14951 ctx.dict = NULL;
14952 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014953
Victor Stinnera47082312012-10-04 02:19:54 +020014954 while (--ctx.fmtcnt >= 0) {
14955 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014956 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014957
14958 nonfmtpos = ctx.fmtpos++;
14959 while (ctx.fmtcnt >= 0 &&
14960 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14961 ctx.fmtpos++;
14962 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014963 }
Victor Stinnera47082312012-10-04 02:19:54 +020014964 if (ctx.fmtcnt < 0) {
14965 ctx.fmtpos--;
14966 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014967 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014968
Victor Stinnercfc4c132013-04-03 01:48:39 +020014969 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14970 nonfmtpos, ctx.fmtpos) < 0)
14971 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 }
14973 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014974 ctx.fmtpos++;
14975 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014976 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014977 }
14978 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014979
Victor Stinnera47082312012-10-04 02:19:54 +020014980 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014981 PyErr_SetString(PyExc_TypeError,
14982 "not all arguments converted during string formatting");
14983 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014984 }
14985
Victor Stinnera47082312012-10-04 02:19:54 +020014986 if (ctx.args_owned) {
14987 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014988 }
Victor Stinnera47082312012-10-04 02:19:54 +020014989 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014990
Benjamin Peterson29060642009-01-31 22:14:21 +000014991 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014992 _PyUnicodeWriter_Dealloc(&ctx.writer);
14993 if (ctx.args_owned) {
14994 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014995 }
14996 return NULL;
14997}
14998
Jeremy Hylton938ace62002-07-17 16:30:39 +000014999static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015000unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15001
Tim Peters6d6c1a32001-08-02 04:15:00 +000015002static PyObject *
15003unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15004{
Benjamin Peterson29060642009-01-31 22:14:21 +000015005 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015006 static char *kwlist[] = {"object", "encoding", "errors", 0};
15007 char *encoding = NULL;
15008 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015009
Benjamin Peterson14339b62009-01-31 16:36:08 +000015010 if (type != &PyUnicode_Type)
15011 return unicode_subtype_new(type, args, kwds);
15012 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015013 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015014 return NULL;
15015 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015016 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015017 if (encoding == NULL && errors == NULL)
15018 return PyObject_Str(x);
15019 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015020 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015021}
15022
Guido van Rossume023fe02001-08-30 03:12:59 +000015023static PyObject *
15024unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15025{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015026 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015027 Py_ssize_t length, char_size;
15028 int share_wstr, share_utf8;
15029 unsigned int kind;
15030 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015031
Benjamin Peterson14339b62009-01-31 16:36:08 +000015032 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015033
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015034 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015035 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015036 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015037 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015038 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015039 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015040 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015041 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015042
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015043 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015044 if (self == NULL) {
15045 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 return NULL;
15047 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015048 kind = PyUnicode_KIND(unicode);
15049 length = PyUnicode_GET_LENGTH(unicode);
15050
15051 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015052#ifdef Py_DEBUG
15053 _PyUnicode_HASH(self) = -1;
15054#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015056#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015057 _PyUnicode_STATE(self).interned = 0;
15058 _PyUnicode_STATE(self).kind = kind;
15059 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015060 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015061 _PyUnicode_STATE(self).ready = 1;
15062 _PyUnicode_WSTR(self) = NULL;
15063 _PyUnicode_UTF8_LENGTH(self) = 0;
15064 _PyUnicode_UTF8(self) = NULL;
15065 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015066 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015067
15068 share_utf8 = 0;
15069 share_wstr = 0;
15070 if (kind == PyUnicode_1BYTE_KIND) {
15071 char_size = 1;
15072 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15073 share_utf8 = 1;
15074 }
15075 else if (kind == PyUnicode_2BYTE_KIND) {
15076 char_size = 2;
15077 if (sizeof(wchar_t) == 2)
15078 share_wstr = 1;
15079 }
15080 else {
15081 assert(kind == PyUnicode_4BYTE_KIND);
15082 char_size = 4;
15083 if (sizeof(wchar_t) == 4)
15084 share_wstr = 1;
15085 }
15086
15087 /* Ensure we won't overflow the length. */
15088 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15089 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015090 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015091 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015092 data = PyObject_MALLOC((length + 1) * char_size);
15093 if (data == NULL) {
15094 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015095 goto onError;
15096 }
15097
Victor Stinnerc3c74152011-10-02 20:39:55 +020015098 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015099 if (share_utf8) {
15100 _PyUnicode_UTF8_LENGTH(self) = length;
15101 _PyUnicode_UTF8(self) = data;
15102 }
15103 if (share_wstr) {
15104 _PyUnicode_WSTR_LENGTH(self) = length;
15105 _PyUnicode_WSTR(self) = (wchar_t *)data;
15106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015107
Christian Heimesf051e432016-09-13 20:22:02 +020015108 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015109 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015110 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015111#ifdef Py_DEBUG
15112 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15113#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015114 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015115 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015116
15117onError:
15118 Py_DECREF(unicode);
15119 Py_DECREF(self);
15120 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015121}
15122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015123PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015124"str(object='') -> str\n\
15125str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015126\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015127Create a new string object from the given object. If encoding or\n\
15128errors is specified, then the object must expose a data buffer\n\
15129that will be decoded using the given encoding and error handler.\n\
15130Otherwise, returns the result of object.__str__() (if defined)\n\
15131or repr(object).\n\
15132encoding defaults to sys.getdefaultencoding().\n\
15133errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015134
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015135static PyObject *unicode_iter(PyObject *seq);
15136
Guido van Rossumd57fd912000-03-10 22:53:23 +000015137PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015138 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015139 "str", /* tp_name */
15140 sizeof(PyUnicodeObject), /* tp_size */
15141 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015142 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015143 (destructor)unicode_dealloc, /* tp_dealloc */
15144 0, /* tp_print */
15145 0, /* tp_getattr */
15146 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015147 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015148 unicode_repr, /* tp_repr */
15149 &unicode_as_number, /* tp_as_number */
15150 &unicode_as_sequence, /* tp_as_sequence */
15151 &unicode_as_mapping, /* tp_as_mapping */
15152 (hashfunc) unicode_hash, /* tp_hash*/
15153 0, /* tp_call*/
15154 (reprfunc) unicode_str, /* tp_str */
15155 PyObject_GenericGetAttr, /* tp_getattro */
15156 0, /* tp_setattro */
15157 0, /* tp_as_buffer */
15158 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015159 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 unicode_doc, /* tp_doc */
15161 0, /* tp_traverse */
15162 0, /* tp_clear */
15163 PyUnicode_RichCompare, /* tp_richcompare */
15164 0, /* tp_weaklistoffset */
15165 unicode_iter, /* tp_iter */
15166 0, /* tp_iternext */
15167 unicode_methods, /* tp_methods */
15168 0, /* tp_members */
15169 0, /* tp_getset */
15170 &PyBaseObject_Type, /* tp_base */
15171 0, /* tp_dict */
15172 0, /* tp_descr_get */
15173 0, /* tp_descr_set */
15174 0, /* tp_dictoffset */
15175 0, /* tp_init */
15176 0, /* tp_alloc */
15177 unicode_new, /* tp_new */
15178 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015179};
15180
15181/* Initialize the Unicode implementation */
15182
Victor Stinner3a50e702011-10-18 21:21:00 +020015183int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015184{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015185 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015186 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015187 0x000A, /* LINE FEED */
15188 0x000D, /* CARRIAGE RETURN */
15189 0x001C, /* FILE SEPARATOR */
15190 0x001D, /* GROUP SEPARATOR */
15191 0x001E, /* RECORD SEPARATOR */
15192 0x0085, /* NEXT LINE */
15193 0x2028, /* LINE SEPARATOR */
15194 0x2029, /* PARAGRAPH SEPARATOR */
15195 };
15196
Fred Drakee4315f52000-05-09 19:53:39 +000015197 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015198 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015199 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015200 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015201 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015202
Guido van Rossumcacfc072002-05-24 19:01:59 +000015203 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015204 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015205
15206 /* initialize the linebreak bloom filter */
15207 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015208 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015209 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015210
Christian Heimes26532f72013-07-20 14:57:16 +020015211 if (PyType_Ready(&EncodingMapType) < 0)
15212 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015213
Benjamin Petersonc4311282012-10-30 23:21:10 -040015214 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15215 Py_FatalError("Can't initialize field name iterator type");
15216
15217 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15218 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015219
Victor Stinner3a50e702011-10-18 21:21:00 +020015220 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015221}
15222
15223/* Finalize the Unicode implementation */
15224
/* This implementation performs no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
15230
Guido van Rossumd57fd912000-03-10 22:53:23 +000015231void
Thomas Wouters78890102000-07-22 19:25:51 +000015232_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015233{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015234 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015235
Serhiy Storchaka05997252013-01-26 12:14:02 +020015236 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015237
Serhiy Storchaka05997252013-01-26 12:14:02 +020015238 for (i = 0; i < 256; i++)
15239 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015240 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015241 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015242}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015243
Walter Dörwald16807132007-05-25 13:52:07 +000015244void
15245PyUnicode_InternInPlace(PyObject **p)
15246{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015247 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015249#ifdef Py_DEBUG
15250 assert(s != NULL);
15251 assert(_PyUnicode_CHECK(s));
15252#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015254 return;
15255#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 /* If it's a subclass, we don't really know what putting
15257 it in the interned dict might do. */
15258 if (!PyUnicode_CheckExact(s))
15259 return;
15260 if (PyUnicode_CHECK_INTERNED(s))
15261 return;
15262 if (interned == NULL) {
15263 interned = PyDict_New();
15264 if (interned == NULL) {
15265 PyErr_Clear(); /* Don't leave an exception */
15266 return;
15267 }
15268 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015270 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015271 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015272 if (t == NULL) {
15273 PyErr_Clear();
15274 return;
15275 }
15276 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015277 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015278 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015279 return;
15280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 /* The two references in interned are not counted by refcnt.
15282 The deallocator will take care of this */
15283 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015284 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015285}
15286
15287void
15288PyUnicode_InternImmortal(PyObject **p)
15289{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 PyUnicode_InternInPlace(p);
15291 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015292 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 Py_INCREF(*p);
15294 }
Walter Dörwald16807132007-05-25 13:52:07 +000015295}
15296
15297PyObject *
15298PyUnicode_InternFromString(const char *cp)
15299{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015300 PyObject *s = PyUnicode_FromString(cp);
15301 if (s == NULL)
15302 return NULL;
15303 PyUnicode_InternInPlace(&s);
15304 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015305}
15306
Alexander Belopolsky40018472011-02-26 01:02:56 +000015307void
15308_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015309{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015311 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 Py_ssize_t i, n;
15313 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015314
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 if (interned == NULL || !PyDict_Check(interned))
15316 return;
15317 keys = PyDict_Keys(interned);
15318 if (keys == NULL || !PyList_Check(keys)) {
15319 PyErr_Clear();
15320 return;
15321 }
Walter Dörwald16807132007-05-25 13:52:07 +000015322
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15324 detector, interned unicode strings are not forcibly deallocated;
15325 rather, we give them their stolen references back, and then clear
15326 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015327
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 n = PyList_GET_SIZE(keys);
15329 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015330 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015331 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015332 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015333 if (PyUnicode_READY(s) == -1) {
15334 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015335 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015337 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015338 case SSTATE_NOT_INTERNED:
15339 /* XXX Shouldn't happen */
15340 break;
15341 case SSTATE_INTERNED_IMMORTAL:
15342 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015343 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 break;
15345 case SSTATE_INTERNED_MORTAL:
15346 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015347 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 break;
15349 default:
15350 Py_FatalError("Inconsistent interned string state.");
15351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015352 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 }
15354 fprintf(stderr, "total size of all interned strings: "
15355 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15356 "mortal/immortal\n", mortal_size, immortal_size);
15357 Py_DECREF(keys);
15358 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015359 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015360}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015361
15362
15363/********************* Unicode Iterator **************************/
15364
15365typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 PyObject_HEAD
15367 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015368 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015369} unicodeiterobject;
15370
15371static void
15372unicodeiter_dealloc(unicodeiterobject *it)
15373{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 _PyObject_GC_UNTRACK(it);
15375 Py_XDECREF(it->it_seq);
15376 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015377}
15378
15379static int
15380unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15381{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015382 Py_VISIT(it->it_seq);
15383 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015384}
15385
15386static PyObject *
15387unicodeiter_next(unicodeiterobject *it)
15388{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015389 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015390
Benjamin Peterson14339b62009-01-31 16:36:08 +000015391 assert(it != NULL);
15392 seq = it->it_seq;
15393 if (seq == NULL)
15394 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015395 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015397 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15398 int kind = PyUnicode_KIND(seq);
15399 void *data = PyUnicode_DATA(seq);
15400 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15401 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 if (item != NULL)
15403 ++it->it_index;
15404 return item;
15405 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015406
Benjamin Peterson14339b62009-01-31 16:36:08 +000015407 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015408 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015409 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015410}
15411
15412static PyObject *
15413unicodeiter_len(unicodeiterobject *it)
15414{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015415 Py_ssize_t len = 0;
15416 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015417 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015418 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015419}
15420
15421PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15422
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015423static PyObject *
15424unicodeiter_reduce(unicodeiterobject *it)
15425{
15426 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015427 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015428 it->it_seq, it->it_index);
15429 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015430 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015431 if (u == NULL)
15432 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015433 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015434 }
15435}
15436
15437PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15438
15439static PyObject *
15440unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15441{
15442 Py_ssize_t index = PyLong_AsSsize_t(state);
15443 if (index == -1 && PyErr_Occurred())
15444 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015445 if (it->it_seq != NULL) {
15446 if (index < 0)
15447 index = 0;
15448 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15449 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15450 it->it_index = index;
15451 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015452 Py_RETURN_NONE;
15453}
15454
15455PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15456
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015457static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015458 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015459 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015460 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15461 reduce_doc},
15462 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15463 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015464 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015465};
15466
15467PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015468 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15469 "str_iterator", /* tp_name */
15470 sizeof(unicodeiterobject), /* tp_basicsize */
15471 0, /* tp_itemsize */
15472 /* methods */
15473 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15474 0, /* tp_print */
15475 0, /* tp_getattr */
15476 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015477 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015478 0, /* tp_repr */
15479 0, /* tp_as_number */
15480 0, /* tp_as_sequence */
15481 0, /* tp_as_mapping */
15482 0, /* tp_hash */
15483 0, /* tp_call */
15484 0, /* tp_str */
15485 PyObject_GenericGetAttr, /* tp_getattro */
15486 0, /* tp_setattro */
15487 0, /* tp_as_buffer */
15488 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15489 0, /* tp_doc */
15490 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15491 0, /* tp_clear */
15492 0, /* tp_richcompare */
15493 0, /* tp_weaklistoffset */
15494 PyObject_SelfIter, /* tp_iter */
15495 (iternextfunc)unicodeiter_next, /* tp_iternext */
15496 unicodeiter_methods, /* tp_methods */
15497 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015498};
15499
15500static PyObject *
15501unicode_iter(PyObject *seq)
15502{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015503 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015504
Benjamin Peterson14339b62009-01-31 16:36:08 +000015505 if (!PyUnicode_Check(seq)) {
15506 PyErr_BadInternalCall();
15507 return NULL;
15508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015509 if (PyUnicode_READY(seq) == -1)
15510 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015511 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15512 if (it == NULL)
15513 return NULL;
15514 it->it_index = 0;
15515 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015516 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015517 _PyObject_GC_TRACK(it);
15518 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015519}
15520
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015521
15522size_t
15523Py_UNICODE_strlen(const Py_UNICODE *u)
15524{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015525 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015526}
15527
15528Py_UNICODE*
15529Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15530{
15531 Py_UNICODE *u = s1;
15532 while ((*u++ = *s2++));
15533 return s1;
15534}
15535
15536Py_UNICODE*
15537Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15538{
15539 Py_UNICODE *u = s1;
15540 while ((*u++ = *s2++))
15541 if (n-- == 0)
15542 break;
15543 return s1;
15544}
15545
15546Py_UNICODE*
15547Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15548{
15549 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015550 u1 += wcslen(u1);
15551 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015552 return s1;
15553}
15554
15555int
15556Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15557{
15558 while (*s1 && *s2 && *s1 == *s2)
15559 s1++, s2++;
15560 if (*s1 && *s2)
15561 return (*s1 < *s2) ? -1 : +1;
15562 if (*s1)
15563 return 1;
15564 if (*s2)
15565 return -1;
15566 return 0;
15567}
15568
15569int
15570Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15571{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015572 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015573 for (; n != 0; n--) {
15574 u1 = *s1;
15575 u2 = *s2;
15576 if (u1 != u2)
15577 return (u1 < u2) ? -1 : +1;
15578 if (u1 == '\0')
15579 return 0;
15580 s1++;
15581 s2++;
15582 }
15583 return 0;
15584}
15585
15586Py_UNICODE*
15587Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15588{
15589 const Py_UNICODE *p;
15590 for (p = s; *p; p++)
15591 if (*p == c)
15592 return (Py_UNICODE*)p;
15593 return NULL;
15594}
15595
15596Py_UNICODE*
15597Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15598{
15599 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015600 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015601 while (p != s) {
15602 p--;
15603 if (*p == c)
15604 return (Py_UNICODE*)p;
15605 }
15606 return NULL;
15607}
Victor Stinner331ea922010-08-10 16:37:20 +000015608
Victor Stinner71133ff2010-09-01 23:43:53 +000015609Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015610PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015611{
Victor Stinner577db2c2011-10-11 22:12:48 +020015612 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015613 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015615 if (!PyUnicode_Check(unicode)) {
15616 PyErr_BadArgument();
15617 return NULL;
15618 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015619 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015620 if (u == NULL)
15621 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015622 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015623 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015624 PyErr_NoMemory();
15625 return NULL;
15626 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015627 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015628 size *= sizeof(Py_UNICODE);
15629 copy = PyMem_Malloc(size);
15630 if (copy == NULL) {
15631 PyErr_NoMemory();
15632 return NULL;
15633 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015634 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015635 return copy;
15636}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015637
Georg Brandl66c221e2010-10-14 07:04:07 +000015638/* A _string module, to export formatter_parser and formatter_field_name_split
15639 to the string.Formatter class implemented in Python. */
15640
15641static PyMethodDef _string_methods[] = {
15642 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15643 METH_O, PyDoc_STR("split the argument as a field name")},
15644 {"formatter_parser", (PyCFunction) formatter_parser,
15645 METH_O, PyDoc_STR("parse the argument as a format string")},
15646 {NULL, NULL}
15647};
15648
15649static struct PyModuleDef _string_module = {
15650 PyModuleDef_HEAD_INIT,
15651 "_string",
15652 PyDoc_STR("string helper module"),
15653 0,
15654 _string_methods,
15655 NULL,
15656 NULL,
15657 NULL,
15658 NULL
15659};
15660
15661PyMODINIT_FUNC
15662PyInit__string(void)
15663{
15664 return PyModule_Create(&_string_module);
15665}
15666
15667
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015668#ifdef __cplusplus
15669}
15670#endif