blob: f7b2aa6c3df8908961a1aabc42beea48f817e916 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090052class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
55
56/*[python input]
57class Py_UCS4_converter(CConverter):
58 type = 'Py_UCS4'
59 converter = 'convert_uc'
60
61 def converter_init(self):
62 if self.default is not unspecified:
63 self.c_default = ascii(self.default)
64 if len(self.c_default) > 4 or self.c_default[0] != "'":
65 self.c_default = hex(ord(self.default))
66
67[python start generated code]*/
68/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080069
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000070/* --- Globals ------------------------------------------------------------
71
Serhiy Storchaka05997252013-01-26 12:14:02 +020072NOTE: In the interpreter's initialization phase, some globals are currently
73 initialized dynamically as needed. In the process Unicode objects may
74 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000075
76*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000078
79#ifdef __cplusplus
80extern "C" {
81#endif
82
/* Highest code point Unicode defines: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the macros below.  With Py_DEBUG it runs
   _PyUnicode_CheckConsistency() with the content scan turned off; otherwise
   it is plain PyUnicode_Check(). */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* --- Unchecked field accessors ------------------------------------------
   Each one expands to a bare struct member access on the object cast to
   the relevant object struct; no assertion is performed. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors --------------------------------------------------
   Same fields, but the object is first validated with _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory that immediately follows the PyASCIIObject header; for every other
   string it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII
   string this is the string length itself; otherwise the utf8_length
   field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200133
/* True when the utf8 pointer of a string that is not compact ASCII is the
   very same pointer as its character data (PyUnicode_DATA), i.e. no
   separate UTF-8 buffer exists. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True when the wstr pointer is the very same pointer as the character
   data (PyUnicode_DATA).
   The comparison must use the macro parameter `op`: the previous spelling
   referenced an identifier named `unicode`, which only compiled (and only
   inspected the right object) when the caller happened to have a variable
   of that name in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
141
/* Non-zero when the object owns a UTF-8 memory block of its own: it is not
   a compact ASCII string, its utf8 pointer is set, and that pointer is not
   simply the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object owns a wstr memory block of its own: the wstr
   pointer is set and either the string is not ready yet or the pointer is
   not simply the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) &&                             \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
155
/* Element-wise copy between character buffers of different widths.

   from_type / to_type  valid type names of the source and target elements
   begin / end          delimit the source range [begin, end); both are
                        treated as "from_type *"
   to                   destination, treated as "to_type *"; it must have
                        room for (end - begin) elements

   Every source element is converted with a plain cast to to_type.  The
   bulk of the range is copied four elements per iteration; the remaining
   zero to three elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_dst = (to_type *)(to);                             \
        const from_type *_src = (from_type *)(begin);                \
        const from_type *_stop = (from_type *)(end);                 \
        const Py_ssize_t _count = _stop - _src;                      \
        const from_type *_quad_stop =                                \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                   \
        for (; _src < _quad_stop; _src += 4, _dst += 4) {            \
            _dst[0] = (to_type) _src[0];                             \
            _dst[1] = (to_type) _src[1];                             \
            _dst[2] = (to_type) _src[2];                             \
            _dst[3] = (to_type) _src[3];                             \
        }                                                            \
        for (; _src < _stop; ++_src, ++_dst)                         \
            *_dst = (to_type) *_src;                                 \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200179
/* Platform-specific overallocation factor. */
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% works best. */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% works best. */
#  define OVERALLOCATE_FACTOR 4
#endif
187
Walter Dörwald16807132007-05-25 13:52:07 +0000188/* This dictionary holds all interned unicode strings. Note that references
189 to strings in this dictionary are *not* counted in the string's ob_refcnt.
190 When the interned string reaches a refcnt of 0 the string deallocation
191 function will delete the reference from this dictionary.
192
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000195*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200196static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200199static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200200
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            /* first use: build the singleton */                        \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return the shared empty string from the enclosing function after taking
   a reference through _Py_INCREF_UNICODE_EMPTY().  The value returned is
   whatever unicode_empty holds afterwards, so it is NULL if the singleton
   could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200220/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700221static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200222_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200224/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000227/* Single character Unicode strings in the Latin-1 range are being
228 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200229static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry i is 1 when code point i (0x00..0x7F) is one of

     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE

   and 0 otherwise.  One row below covers 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
261
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200262/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200263static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200264static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100265static int unicode_modifiable(PyObject *unicode);
266
Victor Stinnerfe226c02011-10-03 03:52:20 +0200267
Alexander Belopolsky40018472011-02-26 01:02:56 +0000268static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100269_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200270static PyObject *
271_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272static PyObject *
273_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274
275static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000276unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000277 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100278 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000279 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280
Alexander Belopolsky40018472011-02-26 01:02:56 +0000281static void
282raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300283 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100284 PyObject *unicode,
285 Py_ssize_t startpos, Py_ssize_t endpos,
286 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000287
/* Lookup table for fast detection of line breaks: entry i is 1 when code
   point i (0x00..0x7F) is one of

     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN

   and 0 otherwise.  One row below covers 16 code points. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
315
INADA Naoki3ae20562017-01-16 20:41:20 +0900316static int convert_uc(PyObject *obj, void *addr);
317
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300318#include "clinic/unicodeobject.c.h"
319
/* Numeric identifiers for codec error handlers.  get_error_handler() maps
   an error-handler name to one of the values from _Py_ERROR_STRICT
   onwards. */
typedef enum {
    _Py_ERROR_UNKNOWN           = 0,
    _Py_ERROR_STRICT            = 1,    /* "strict", or no name given */
    _Py_ERROR_SURROGATEESCAPE   = 2,    /* "surrogateescape" */
    _Py_ERROR_REPLACE           = 3,    /* "replace" */
    _Py_ERROR_IGNORE            = 4,    /* "ignore" */
    _Py_ERROR_BACKSLASHREPLACE  = 5,    /* "backslashreplace" */
    _Py_ERROR_SURROGATEPASS     = 6,    /* "surrogatepass" */
    _Py_ERROR_XMLCHARREFREPLACE = 7,    /* "xmlcharrefreplace" */
    _Py_ERROR_OTHER             = 8     /* any other handler name */
} _Py_error_handler;
331
332static _Py_error_handler
333get_error_handler(const char *errors)
334{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200335 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200336 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200337 }
338 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200339 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200340 }
341 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200342 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200343 }
344 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200345 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200346 }
347 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200348 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200349 }
350 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200351 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200352 }
353 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200354 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200355 }
Victor Stinner50149202015-09-22 00:26:54 +0200356 return _Py_ERROR_OTHER;
357}
358
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300359/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000362PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000363{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000364#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000365 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000366#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000367 /* This is actually an illegal character, so it should
368 not be passed to unichr. */
369 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000370#endif
371}
372
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a str object.

   op:            object to inspect; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not of wchar kind), the
                  characters are scanned as well to verify that the
                  narrowest possible kind is in use and that the buffer is
                  zero-terminated.

   Every violated invariant trips an assert(); the function itself always
   returns 1, so a call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact_hdr = (PyCompactUnicodeObject *)op;
        void *chars;

        if (hdr->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the compact
               header directly */
            chars = compact_hdr + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(compact_hdr->utf8 != chars);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact string that is not ready yet: only wstr is set */
            chars = ((PyUnicodeObject *)op)->data.any;
            assert(hdr->length == 0);
            assert(hdr->hash == -1);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 0);
            assert(hdr->state.interned == SSTATE_NOT_INTERNED);
            assert(hdr->wstr != NULL);
            assert(chars == NULL);
            assert(compact_hdr->utf8 == NULL);
        }
        else {
            /* non-compact string that is ready */
            chars = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ready == 1);
            assert(chars != NULL);
            if (hdr->state.ascii) {
                /* ASCII data doubles as the UTF-8 representation */
                assert(compact_hdr->utf8 == chars);
                assert(compact_hdr->utf8_length == hdr->length);
            }
            else {
                assert(compact_hdr->utf8 != chars);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must be the data pointer exactly when the string's
               kind has the width of wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized) {
                assert(hdr->wstr == chars);
                assert(compact_hdr->wstr_length == hdr->length);
            }
            else {
                assert(hdr->wstr != chars);
            }
        }

        /* an absent cached representation must have a zero length */
        assert(compact_hdr->utf8 != NULL || compact_hdr->utf8_length == 0);
        assert(hdr->wstr != NULL || compact_hdr->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t idx;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(hdr);

        for (idx = 0; idx < hdr->length; idx++) {
            Py_UCS4 cur = PyUnicode_READ(kind, chars, idx);
            if (cur > highest) {
                highest = cur;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (hdr->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }

        /* the element one past the last character must be zero */
        assert(PyUnicode_READ(kind, chars, hdr->length) == 0);
    }
    return 1;
}
#endif
488
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489static PyObject*
490unicode_result_wchar(PyObject *unicode)
491{
492#ifndef Py_DEBUG
493 Py_ssize_t len;
494
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100495 len = _PyUnicode_WSTR_LENGTH(unicode);
496 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100497 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 }
500
501 if (len == 1) {
502 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100503 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100504 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505 Py_DECREF(unicode);
506 return latin1_char;
507 }
508 }
509
510 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200511 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 return NULL;
513 }
514#else
Victor Stinneraa771272012-10-04 02:32:58 +0200515 assert(Py_REFCNT(unicode) == 1);
516
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100517 /* don't make the result ready in debug mode to ensure that the caller
518 makes the string ready before using it */
519 assert(_PyUnicode_CheckConsistency(unicode, 1));
520#endif
521 return unicode;
522}
523
524static PyObject*
525unicode_result_ready(PyObject *unicode)
526{
527 Py_ssize_t length;
528
529 length = PyUnicode_GET_LENGTH(unicode);
530 if (length == 0) {
531 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100532 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200533 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 }
535 return unicode_empty;
536 }
537
538 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200539 void *data = PyUnicode_DATA(unicode);
540 int kind = PyUnicode_KIND(unicode);
541 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 if (ch < 256) {
543 PyObject *latin1_char = unicode_latin1[ch];
544 if (latin1_char != NULL) {
545 if (unicode != latin1_char) {
546 Py_INCREF(latin1_char);
547 Py_DECREF(unicode);
548 }
549 return latin1_char;
550 }
551 else {
552 assert(_PyUnicode_CheckConsistency(unicode, 1));
553 Py_INCREF(unicode);
554 unicode_latin1[ch] = unicode;
555 return unicode;
556 }
557 }
558 }
559
560 assert(_PyUnicode_CheckConsistency(unicode, 1));
561 return unicode;
562}
563
564static PyObject*
565unicode_result(PyObject *unicode)
566{
567 assert(_PyUnicode_CHECK(unicode));
568 if (PyUnicode_IS_READY(unicode))
569 return unicode_result_ready(unicode);
570 else
571 return unicode_result_wchar(unicode);
572}
573
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574static PyObject*
575unicode_result_unchanged(PyObject *unicode)
576{
577 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500578 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100579 return NULL;
580 Py_INCREF(unicode);
581 return unicode;
582 }
583 else
584 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100585 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100586}
587
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200588/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589 ASCII, Latin1, UTF-8, etc. */
590static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200591backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200592 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593{
Victor Stinnerad771582015-10-09 12:38:53 +0200594 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200595 Py_UCS4 ch;
596 enum PyUnicode_Kind kind;
597 void *data;
598
599 assert(PyUnicode_IS_READY(unicode));
600 kind = PyUnicode_KIND(unicode);
601 data = PyUnicode_DATA(unicode);
602
603 size = 0;
604 /* determine replacement size */
605 for (i = collstart; i < collend; ++i) {
606 Py_ssize_t incr;
607
608 ch = PyUnicode_READ(kind, data, i);
609 if (ch < 0x100)
610 incr = 2+2;
611 else if (ch < 0x10000)
612 incr = 2+4;
613 else {
614 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200615 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200616 }
617 if (size > PY_SSIZE_T_MAX - incr) {
618 PyErr_SetString(PyExc_OverflowError,
619 "encoded result is too long for a Python string");
620 return NULL;
621 }
622 size += incr;
623 }
624
Victor Stinnerad771582015-10-09 12:38:53 +0200625 str = _PyBytesWriter_Prepare(writer, str, size);
626 if (str == NULL)
627 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628
629 /* generate replacement */
630 for (i = collstart; i < collend; ++i) {
631 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200632 *str++ = '\\';
633 if (ch >= 0x00010000) {
634 *str++ = 'U';
635 *str++ = Py_hexdigits[(ch>>28)&0xf];
636 *str++ = Py_hexdigits[(ch>>24)&0xf];
637 *str++ = Py_hexdigits[(ch>>20)&0xf];
638 *str++ = Py_hexdigits[(ch>>16)&0xf];
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200641 }
Victor Stinner797485e2015-10-09 03:17:30 +0200642 else if (ch >= 0x100) {
643 *str++ = 'u';
644 *str++ = Py_hexdigits[(ch>>12)&0xf];
645 *str++ = Py_hexdigits[(ch>>8)&0xf];
646 }
647 else
648 *str++ = 'x';
649 *str++ = Py_hexdigits[(ch>>4)&0xf];
650 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 }
652 return str;
653}
654
655/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656 ASCII, Latin1, UTF-8, etc. */
657static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200658xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200659 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660{
Victor Stinnerad771582015-10-09 12:38:53 +0200661 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 Py_UCS4 ch;
663 enum PyUnicode_Kind kind;
664 void *data;
665
666 assert(PyUnicode_IS_READY(unicode));
667 kind = PyUnicode_KIND(unicode);
668 data = PyUnicode_DATA(unicode);
669
670 size = 0;
671 /* determine replacement size */
672 for (i = collstart; i < collend; ++i) {
673 Py_ssize_t incr;
674
675 ch = PyUnicode_READ(kind, data, i);
676 if (ch < 10)
677 incr = 2+1+1;
678 else if (ch < 100)
679 incr = 2+2+1;
680 else if (ch < 1000)
681 incr = 2+3+1;
682 else if (ch < 10000)
683 incr = 2+4+1;
684 else if (ch < 100000)
685 incr = 2+5+1;
686 else if (ch < 1000000)
687 incr = 2+6+1;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+7+1;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
Victor Stinnerad771582015-10-09 12:38:53 +0200700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707 }
708 return str;
709}
710
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711/* --- Bloom Filters ----------------------------------------------------- */
712
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
716
717/* the linebreak mask is set up by Unicode_Init below */
718
Antoine Pitrouf068f942010-01-13 14:19:12 +0000719#if LONG_BIT >= 128
720#define BLOOM_WIDTH 128
721#elif LONG_BIT >= 64
722#define BLOOM_WIDTH 64
723#elif LONG_BIT >= 32
724#define BLOOM_WIDTH 32
725#else
726#error "LONG_BIT is smaller than 32"
727#endif
728
Thomas Wouters477c8d52006-05-27 19:21:47 +0000729#define BLOOM_MASK unsigned long
730
Serhiy Storchaka05997252013-01-26 12:14:02 +0200731static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
Benjamin Peterson29060642009-01-31 22:14:21 +0000735#define BLOOM_LINEBREAK(ch) \
736 ((ch) < 128U ? ascii_linebreak[(ch)] : \
737 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700739static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741{
Victor Stinnera85af502013-04-09 21:53:54 +0200742#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743 do { \
744 TYPE *data = (TYPE *)PTR; \
745 TYPE *end = data + LEN; \
746 Py_UCS4 ch; \
747 for (; data != end; data++) { \
748 ch = *data; \
749 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750 } \
751 break; \
752 } while (0)
753
Thomas Wouters477c8d52006-05-27 19:21:47 +0000754 /* calculate simple bloom-style bitmask for a given unicode string */
755
Antoine Pitrouf068f942010-01-13 14:19:12 +0000756 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000757
758 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200759 switch (kind) {
760 case PyUnicode_1BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762 break;
763 case PyUnicode_2BYTE_KIND:
764 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765 break;
766 case PyUnicode_4BYTE_KIND:
767 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768 break;
769 default:
770 assert(0);
771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200773
774#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775}
776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300777static int
778ensure_unicode(PyObject *obj)
779{
780 if (!PyUnicode_Check(obj)) {
781 PyErr_Format(PyExc_TypeError,
782 "must be str, not %.100s",
783 Py_TYPE(obj)->tp_name);
784 return -1;
785 }
786 return PyUnicode_READY(obj);
787}
788
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200789/* Compilation of templated routines */
790
791#include "stringlib/asciilib.h"
792#include "stringlib/fastsearch.h"
793#include "stringlib/partition.h"
794#include "stringlib/split.h"
795#include "stringlib/count.h"
796#include "stringlib/find.h"
797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs1lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300807#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs2lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300818#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
823#include "stringlib/ucs4lib.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/partition.h"
826#include "stringlib/split.h"
827#include "stringlib/count.h"
828#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300829#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200830#include "stringlib/find_max_char.h"
831#include "stringlib/localeutil.h"
832#include "stringlib/undef.h"
833
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834#include "stringlib/unicodedefs.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/count.h"
837#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100838#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200839
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840/* --- Unicode Object ----------------------------------------------------- */
841
/* Forward declaration of the file-local helper fixup(): it takes a string
   object and a callback that receives a string object and returns a
   Py_UCS4, and it returns a PyObject pointer. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700845static inline Py_ssize_t
846findchar(const void *s, int kind,
847 Py_ssize_t size, Py_UCS4 ch,
848 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 switch (kind) {
851 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200852 if ((Py_UCS1) ch != ch)
853 return -1;
854 if (direction > 0)
855 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856 else
857 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200858 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200859 if ((Py_UCS2) ch != ch)
860 return -1;
861 if (direction > 0)
862 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863 else
864 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200866 if (direction > 0)
867 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868 else
869 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200870 default:
871 assert(0);
872 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
#ifdef Py_DEBUG
/* Overwrite the characters added to a string by a resize with 0xff bytes, so
   that bugs show up earlier.

   Only the characters from index old_length up to the current length are
   touched; nothing is done if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size of one character in bytes */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
894
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895static PyObject*
896resize_compact(PyObject *unicode, Py_ssize_t length)
897{
898 Py_ssize_t char_size;
899 Py_ssize_t struct_size;
900 Py_ssize_t new_size;
901 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100902 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200903#ifdef Py_DEBUG
904 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905#endif
906
Victor Stinner79891572012-05-03 13:43:07 +0200907 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100909 assert(PyUnicode_IS_COMPACT(unicode));
910
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200911 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913 struct_size = sizeof(PyASCIIObject);
914 else
915 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200916 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919 PyErr_NoMemory();
920 return NULL;
921 }
922 new_size = (struct_size + (length + 1) * char_size);
923
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925 PyObject_DEL(_PyUnicode_UTF8(unicode));
926 _PyUnicode_UTF8(unicode) = NULL;
927 _PyUnicode_UTF8_LENGTH(unicode) = 0;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 _Py_DEC_REFTOTAL;
930 _Py_ForgetReference(unicode);
931
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300932 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100933 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100934 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 PyErr_NoMemory();
936 return NULL;
937 }
Victor Stinner84def372011-12-11 20:04:56 +0100938 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100944 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200945 _PyUnicode_WSTR_LENGTH(unicode) = length;
946 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100947 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_WSTR(unicode));
949 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100950 if (!PyUnicode_IS_ASCII(unicode))
951 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100952 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200953#ifdef Py_DEBUG
954 unicode_fill_invalid(unicode, old_length);
955#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200958 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 return unicode;
960}
961
Alexander Belopolsky40018472011-02-26 01:02:56 +0000962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200963resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964{
Victor Stinner95663112011-10-04 01:03:50 +0200965 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100966 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000969
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 if (PyUnicode_IS_READY(unicode)) {
971 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200972 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200974#ifdef Py_DEBUG
975 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
978 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200979 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982
983 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984 PyErr_NoMemory();
985 return -1;
986 }
987 new_size = (length + 1) * char_size;
988
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990 {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 data = (PyObject *)PyObject_REALLOC(data, new_size);
997 if (data == NULL) {
998 PyErr_NoMemory();
999 return -1;
1000 }
1001 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 _PyUnicode_WSTR_LENGTH(unicode) = length;
1005 }
1006 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001007 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 _PyUnicode_UTF8_LENGTH(unicode) = length;
1009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 _PyUnicode_LENGTH(unicode) = length;
1011 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001012#ifdef Py_DEBUG
1013 unicode_fill_invalid(unicode, old_length);
1014#endif
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001016 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinner95663112011-10-04 01:03:50 +02001020 assert(_PyUnicode_WSTR(unicode) != NULL);
1021
1022 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001023 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001024 PyErr_NoMemory();
1025 return -1;
1026 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001028 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001030 if (!wstr) {
1031 PyErr_NoMemory();
1032 return -1;
1033 }
1034 _PyUnicode_WSTR(unicode) = wstr;
1035 _PyUnicode_WSTR(unicode)[length] = 0;
1036 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001037 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return 0;
1039}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001073 Ux0000 terminated; some code (e.g. new_identifier)
1074 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return the value of the PyUnicode_UTF8() macro for unicode.  Only compiled
   when Py_DEBUG is defined. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}
1186
/* Return the value of the _PyUnicode_COMPACT_DATA() macro for unicode, as a
   function that can be called from a debugger. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Print layout details of unicode to stdout (object address, compact and
   compact-ASCII flags, the addresses just past a PyASCIIObject header and
   just past a PyCompactUnicodeObject header, and the compact data pointer),
   then return the value of the PyUnicode_DATA() macro. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    /* address right after a PyASCIIObject header */
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    /* address right after a PyCompactUnicodeObject header */
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4; it
   decodes surrogate pairs.  The other conversions are implemented as macros
   for efficiency.

   The wchar_t characters in [begin, end) are written to the 4-byte data of
   unicode.  A high surrogate directly followed by a low surrogate becomes
   one code point; every other character is copied as is.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only examined when it lies inside the input */
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src+1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
1522 assert(0);
1523 return -1;
1524 }
1525 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001526 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001527 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001528 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001529 Py_ssize_t i;
1530
Victor Stinnera0702ab2011-09-29 14:14:38 +02001531 for (i=0; i < how_many; i++) {
1532 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (ch > to_maxchar)
1534 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001535 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 }
1538 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 return 0;
1540}
1541
Victor Stinnerd3f08822012-05-29 12:57:52 +02001542void
1543_PyUnicode_FastCopyCharacters(
1544 PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546{
1547 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548}
1549
1550Py_ssize_t
1551PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many)
1554{
1555 int err;
1556
1557 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561
Benjamin Petersonbac79492012-01-14 13:34:47 -05001562 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001564 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001567 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001568 PyErr_SetString(PyExc_IndexError, "string index out of range");
1569 return -1;
1570 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001571 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001572 PyErr_SetString(PyExc_IndexError, "string index out of range");
1573 return -1;
1574 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001575 if (how_many < 0) {
1576 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577 return -1;
1578 }
1579 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001582 "Cannot write %zi characters at %zi "
1583 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001584 how_many, to_start, PyUnicode_GET_LENGTH(to));
1585 return -1;
1586 }
1587
1588 if (how_many == 0)
1589 return 0;
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 return -1;
1593
1594 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595 if (err) {
1596 PyErr_Format(PyExc_SystemError,
1597 "Cannot copy %s characters "
1598 "into a string of %s characters",
1599 unicode_kind_name(from),
1600 unicode_kind_name(to));
1601 return -1;
1602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604}
1605
Victor Stinner17222162011-09-28 22:15:37 +02001606/* Find the maximum code point and count the number of surrogate pairs so a
1607 correct string length can be computed before converting a string to UCS4.
1608 This function counts single surrogates as a character and not as a pair.
1609
1610 Return 0 on success, or -1 on error. */
1611static int
1612find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614{
1615 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001616 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617
Victor Stinnerc53be962011-10-02 21:33:54 +02001618 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 *num_surrogates = 0;
1620 *maxchar = 0;
1621
1622 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001624 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625 && (iter+1) < end
1626 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627 {
1628 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629 ++(*num_surrogates);
1630 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001634 {
1635 ch = *iter;
1636 iter++;
1637 }
1638 if (ch > *maxchar) {
1639 *maxchar = ch;
1640 if (*maxchar > MAX_UNICODE) {
1641 PyErr_Format(PyExc_ValueError,
1642 "character U+%x is not in range [U+0000; U+10ffff]",
1643 ch);
1644 return -1;
1645 }
1646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 }
1648 return 0;
1649}
1650
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001651int
1652_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653{
1654 wchar_t *end;
1655 Py_UCS4 maxchar = 0;
1656 Py_ssize_t num_surrogates;
1657#if SIZEOF_WCHAR_T == 2
1658 Py_ssize_t length_wo_surrogates;
1659#endif
1660
Georg Brandl7597add2011-10-05 16:36:47 +02001661 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001662 strings were created using _PyObject_New() and where no canonical
1663 representation (the str field) has been set yet aka strings
1664 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001665 assert(_PyUnicode_CHECK(unicode));
1666 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001668 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001669 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001670 /* Actually, it should neither be interned nor be anything else: */
1671 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001674 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677
1678 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001679 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyErr_NoMemory();
1682 return -1;
1683 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 _PyUnicode_WSTR(unicode), end,
1686 PyUnicode_1BYTE_DATA(unicode));
1687 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001691 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 }
1695 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001696 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
1700 PyObject_FREE(_PyUnicode_WSTR(unicode));
1701 _PyUnicode_WSTR(unicode) = NULL;
1702 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703 }
1704 /* In this case we might have to convert down from 4-byte native
1705 wchar_t to 2-byte unicode. */
1706 else if (maxchar < 65536) {
1707 assert(num_surrogates == 0 &&
1708 "FindMaxCharAndNumSurrogatePairs() messed up");
1709
Victor Stinner506f5922011-09-28 22:34:18 +02001710#if SIZEOF_WCHAR_T == 2
1711 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718#else
1719 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001720 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001721 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001723 PyErr_NoMemory();
1724 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 }
Victor Stinner506f5922011-09-28 22:34:18 +02001726 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727 _PyUnicode_WSTR(unicode), end,
1728 PyUnicode_2BYTE_DATA(unicode));
1729 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001732 _PyUnicode_UTF8(unicode) = NULL;
1733 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 }
1739 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740 else {
1741#if SIZEOF_WCHAR_T == 2
1742 /* in case the native representation is 2-bytes, we need to allocate a
1743 new normalized 4-byte version. */
1744 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001745 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746 PyErr_NoMemory();
1747 return -1;
1748 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001749 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 PyErr_NoMemory();
1752 return -1;
1753 }
1754 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001758 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 PyObject_FREE(_PyUnicode_WSTR(unicode));
1762 _PyUnicode_WSTR(unicode) = NULL;
1763 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764#else
1765 assert(num_surrogates == 0);
1766
Victor Stinnerc3c74152011-10-02 20:39:55 +02001767 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001769 _PyUnicode_UTF8(unicode) = NULL;
1770 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772#endif
1773 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774 }
1775 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001776 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 return 0;
1778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001781unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald16807132007-05-25 13:52:07 +00001783 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 case SSTATE_NOT_INTERNED:
1785 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001786
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 case SSTATE_INTERNED_MORTAL:
1788 /* revive dead object temporarily for DelItem */
1789 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001790 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 Py_FatalError(
1792 "deletion of interned string failed");
1793 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001794
Benjamin Peterson29060642009-01-31 22:14:21 +00001795 case SSTATE_INTERNED_IMMORTAL:
1796 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001797 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001798
Benjamin Peterson29060642009-01-31 22:14:21 +00001799 default:
1800 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001801 }
1802
Victor Stinner03490912011-10-03 23:45:12 +02001803 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001805 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001806 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001807 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1808 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001810 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811}
1812
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001813#ifdef Py_DEBUG
1814static int
1815unicode_is_singleton(PyObject *unicode)
1816{
1817 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1818 if (unicode == unicode_empty)
1819 return 1;
1820 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1821 {
1822 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1823 if (ch < 256 && unicode_latin1[ch] == unicode)
1824 return 1;
1825 }
1826 return 0;
1827}
1828#endif
1829
Alexander Belopolsky40018472011-02-26 01:02:56 +00001830static int
Victor Stinner488fa492011-12-12 00:01:39 +01001831unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001832{
Victor Stinner488fa492011-12-12 00:01:39 +01001833 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001834 if (Py_REFCNT(unicode) != 1)
1835 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001836 if (_PyUnicode_HASH(unicode) != -1)
1837 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001838 if (PyUnicode_CHECK_INTERNED(unicode))
1839 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001840 if (!PyUnicode_CheckExact(unicode))
1841 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001842#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001843 /* singleton refcount is greater than 1 */
1844 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001845#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001846 return 1;
1847}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001848
Victor Stinnerfe226c02011-10-03 03:52:20 +02001849static int
1850unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1851{
1852 PyObject *unicode;
1853 Py_ssize_t old_length;
1854
1855 assert(p_unicode != NULL);
1856 unicode = *p_unicode;
1857
1858 assert(unicode != NULL);
1859 assert(PyUnicode_Check(unicode));
1860 assert(0 <= length);
1861
Victor Stinner910337b2011-10-03 03:20:16 +02001862 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001863 old_length = PyUnicode_WSTR_LENGTH(unicode);
1864 else
1865 old_length = PyUnicode_GET_LENGTH(unicode);
1866 if (old_length == length)
1867 return 0;
1868
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001869 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001870 _Py_INCREF_UNICODE_EMPTY();
1871 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001872 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001873 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001874 return 0;
1875 }
1876
Victor Stinner488fa492011-12-12 00:01:39 +01001877 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001878 PyObject *copy = resize_copy(unicode, length);
1879 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001880 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001881 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001882 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001883 }
1884
Victor Stinnerfe226c02011-10-03 03:52:20 +02001885 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001886 PyObject *new_unicode = resize_compact(unicode, length);
1887 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001888 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001889 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001890 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001891 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001892 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001893}
1894
Alexander Belopolsky40018472011-02-26 01:02:56 +00001895int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001896PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001897{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001898 PyObject *unicode;
1899 if (p_unicode == NULL) {
1900 PyErr_BadInternalCall();
1901 return -1;
1902 }
1903 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001904 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001905 {
1906 PyErr_BadInternalCall();
1907 return -1;
1908 }
1909 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001910}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001911
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001912/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001913
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001914 WARNING: The function doesn't copy the terminating null character and
1915 doesn't check the maximum character (may write a latin1 character in an
1916 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001917static void
1918unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1919 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001920{
1921 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1922 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001923 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001924
1925 switch (kind) {
1926 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001927 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001928#ifdef Py_DEBUG
1929 if (PyUnicode_IS_ASCII(unicode)) {
1930 Py_UCS4 maxchar = ucs1lib_find_max_char(
1931 (const Py_UCS1*)str,
1932 (const Py_UCS1*)str + len);
1933 assert(maxchar < 128);
1934 }
1935#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001936 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001937 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001938 }
1939 case PyUnicode_2BYTE_KIND: {
1940 Py_UCS2 *start = (Py_UCS2 *)data + index;
1941 Py_UCS2 *ucs2 = start;
1942 assert(index <= PyUnicode_GET_LENGTH(unicode));
1943
Victor Stinner184252a2012-06-16 02:57:41 +02001944 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001945 *ucs2 = (Py_UCS2)*str;
1946
1947 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001948 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001949 }
1950 default: {
1951 Py_UCS4 *start = (Py_UCS4 *)data + index;
1952 Py_UCS4 *ucs4 = start;
1953 assert(kind == PyUnicode_4BYTE_KIND);
1954 assert(index <= PyUnicode_GET_LENGTH(unicode));
1955
Victor Stinner184252a2012-06-16 02:57:41 +02001956 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001957 *ucs4 = (Py_UCS4)*str;
1958
1959 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001960 }
1961 }
1962}
1963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964static PyObject*
1965get_latin1_char(unsigned char ch)
1966{
Victor Stinnera464fc12011-10-02 20:39:30 +02001967 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001969 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970 if (!unicode)
1971 return NULL;
1972 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001974 unicode_latin1[ch] = unicode;
1975 }
1976 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001977 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978}
1979
Victor Stinner985a82a2014-01-03 12:53:47 +01001980static PyObject*
1981unicode_char(Py_UCS4 ch)
1982{
1983 PyObject *unicode;
1984
1985 assert(ch <= MAX_UNICODE);
1986
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001987 if (ch < 256)
1988 return get_latin1_char(ch);
1989
Victor Stinner985a82a2014-01-03 12:53:47 +01001990 unicode = PyUnicode_New(1, ch);
1991 if (unicode == NULL)
1992 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001993
1994 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1995 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001996 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001997 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001998 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1999 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2000 }
2001 assert(_PyUnicode_CheckConsistency(unicode, 1));
2002 return unicode;
2003}
2004
Alexander Belopolsky40018472011-02-26 01:02:56 +00002005PyObject *
2006PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002008 if (u == NULL)
2009 return (PyObject*)_PyUnicode_New(size);
2010
2011 if (size < 0) {
2012 PyErr_BadInternalCall();
2013 return NULL;
2014 }
2015
2016 return PyUnicode_FromWideChar(u, size);
2017}
2018
2019PyObject *
2020PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2021{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002022 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 Py_UCS4 maxchar = 0;
2024 Py_ssize_t num_surrogates;
2025
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002026 if (u == NULL && size != 0) {
2027 PyErr_BadInternalCall();
2028 return NULL;
2029 }
2030
2031 if (size == -1) {
2032 size = wcslen(u);
2033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002035 /* If the Unicode data is known at construction time, we can apply
2036 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002039 if (size == 0)
2040 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 /* Single character Unicode objects in the Latin-1 range are
2043 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002044 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 return get_latin1_char((unsigned char)*u);
2046
2047 /* If not empty and not single character, copy the Unicode data
2048 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002049 if (find_maxchar_surrogates(u, u + size,
2050 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 return NULL;
2052
Victor Stinner8faf8212011-12-08 22:14:11 +01002053 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 if (!unicode)
2055 return NULL;
2056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 switch (PyUnicode_KIND(unicode)) {
2058 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002059 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2061 break;
2062 case PyUnicode_2BYTE_KIND:
2063#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002064 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002066 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2068#endif
2069 break;
2070 case PyUnicode_4BYTE_KIND:
2071#if SIZEOF_WCHAR_T == 2
2072 /* This is the only case which has to process surrogates, thus
2073 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002074 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075#else
2076 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002077 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002078#endif
2079 break;
2080 default:
2081 assert(0 && "Impossible state");
2082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002084 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085}
2086
Alexander Belopolsky40018472011-02-26 01:02:56 +00002087PyObject *
2088PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002089{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002090 if (size < 0) {
2091 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002092 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002093 return NULL;
2094 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002095 if (u != NULL)
2096 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2097 else
2098 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002099}
2100
Alexander Belopolsky40018472011-02-26 01:02:56 +00002101PyObject *
2102PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002103{
2104 size_t size = strlen(u);
2105 if (size > PY_SSIZE_T_MAX) {
2106 PyErr_SetString(PyExc_OverflowError, "input too long");
2107 return NULL;
2108 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002109 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002110}
2111
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002112PyObject *
2113_PyUnicode_FromId(_Py_Identifier *id)
2114{
2115 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002116 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2117 strlen(id->string),
2118 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002119 if (!id->object)
2120 return NULL;
2121 PyUnicode_InternInPlace(&id->object);
2122 assert(!id->next);
2123 id->next = static_strings;
2124 static_strings = id;
2125 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002126 return id->object;
2127}
2128
2129void
2130_PyUnicode_ClearStaticStrings()
2131{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002132 _Py_Identifier *tmp, *s = static_strings;
2133 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002134 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002135 tmp = s->next;
2136 s->next = NULL;
2137 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002138 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002139 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002140}
2141
Benjamin Peterson0df54292012-03-26 14:50:32 -04002142/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002143
Victor Stinnerd3f08822012-05-29 12:57:52 +02002144PyObject*
2145_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002146{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002147 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002148 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002149 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002150#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002151 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002152#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002153 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002154 }
Victor Stinner785938e2011-12-11 20:09:03 +01002155 unicode = PyUnicode_New(size, 127);
2156 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002157 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002158 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2159 assert(_PyUnicode_CheckConsistency(unicode, 1));
2160 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002161}
2162
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002163static Py_UCS4
2164kind_maxchar_limit(unsigned int kind)
2165{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002166 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002167 case PyUnicode_1BYTE_KIND:
2168 return 0x80;
2169 case PyUnicode_2BYTE_KIND:
2170 return 0x100;
2171 case PyUnicode_4BYTE_KIND:
2172 return 0x10000;
2173 default:
2174 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002175 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002176 }
2177}
2178
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002179static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002180align_maxchar(Py_UCS4 maxchar)
2181{
2182 if (maxchar <= 127)
2183 return 127;
2184 else if (maxchar <= 255)
2185 return 255;
2186 else if (maxchar <= 65535)
2187 return 65535;
2188 else
2189 return MAX_UNICODE;
2190}
2191
Victor Stinner702c7342011-10-05 13:50:52 +02002192static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002193_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002196 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002197
Serhiy Storchaka678db842013-01-26 12:16:36 +02002198 if (size == 0)
2199 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002200 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002201 if (size == 1)
2202 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002203
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002204 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002205 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 if (!res)
2207 return NULL;
2208 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002209 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002211}
2212
Victor Stinnere57b1c02011-09-28 22:20:48 +02002213static PyObject*
2214_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215{
2216 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002217 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002218
Serhiy Storchaka678db842013-01-26 12:16:36 +02002219 if (size == 0)
2220 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002221 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002222 if (size == 1)
2223 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002224
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002225 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002226 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 if (!res)
2228 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002229 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002231 else {
2232 _PyUnicode_CONVERT_BYTES(
2233 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2234 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002235 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 return res;
2237}
2238
Victor Stinnere57b1c02011-09-28 22:20:48 +02002239static PyObject*
2240_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241{
2242 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002243 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244
Serhiy Storchaka678db842013-01-26 12:16:36 +02002245 if (size == 0)
2246 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002247 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002248 if (size == 1)
2249 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002250
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002251 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002252 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 if (!res)
2254 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002255 if (max_char < 256)
2256 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2257 PyUnicode_1BYTE_DATA(res));
2258 else if (max_char < 0x10000)
2259 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2260 PyUnicode_2BYTE_DATA(res));
2261 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002263 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 return res;
2265}
2266
2267PyObject*
2268PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2269{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002270 if (size < 0) {
2271 PyErr_SetString(PyExc_ValueError, "size must be positive");
2272 return NULL;
2273 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002274 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002276 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002278 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002280 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002281 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002282 PyErr_SetString(PyExc_SystemError, "invalid kind");
2283 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285}
2286
Victor Stinnerece58de2012-04-23 23:36:38 +02002287Py_UCS4
2288_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2289{
2290 enum PyUnicode_Kind kind;
2291 void *startptr, *endptr;
2292
2293 assert(PyUnicode_IS_READY(unicode));
2294 assert(0 <= start);
2295 assert(end <= PyUnicode_GET_LENGTH(unicode));
2296 assert(start <= end);
2297
2298 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2299 return PyUnicode_MAX_CHAR_VALUE(unicode);
2300
2301 if (start == end)
2302 return 127;
2303
Victor Stinner94d558b2012-04-27 22:26:58 +02002304 if (PyUnicode_IS_ASCII(unicode))
2305 return 127;
2306
Victor Stinnerece58de2012-04-23 23:36:38 +02002307 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002308 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002309 endptr = (char *)startptr + end * kind;
2310 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002311 switch(kind) {
2312 case PyUnicode_1BYTE_KIND:
2313 return ucs1lib_find_max_char(startptr, endptr);
2314 case PyUnicode_2BYTE_KIND:
2315 return ucs2lib_find_max_char(startptr, endptr);
2316 case PyUnicode_4BYTE_KIND:
2317 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002318 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002319 assert(0);
2320 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002321 }
2322}
2323
Victor Stinner25a4b292011-10-06 12:31:55 +02002324/* Ensure that a string uses the most efficient storage, if it is not the
2325 case: create a new string with of the right kind. Write NULL into *p_unicode
2326 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002327static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002328unicode_adjust_maxchar(PyObject **p_unicode)
2329{
2330 PyObject *unicode, *copy;
2331 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002332 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002333 unsigned int kind;
2334
2335 assert(p_unicode != NULL);
2336 unicode = *p_unicode;
2337 assert(PyUnicode_IS_READY(unicode));
2338 if (PyUnicode_IS_ASCII(unicode))
2339 return;
2340
2341 len = PyUnicode_GET_LENGTH(unicode);
2342 kind = PyUnicode_KIND(unicode);
2343 if (kind == PyUnicode_1BYTE_KIND) {
2344 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002345 max_char = ucs1lib_find_max_char(u, u + len);
2346 if (max_char >= 128)
2347 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002348 }
2349 else if (kind == PyUnicode_2BYTE_KIND) {
2350 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002351 max_char = ucs2lib_find_max_char(u, u + len);
2352 if (max_char >= 256)
2353 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002354 }
2355 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002356 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002357 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002358 max_char = ucs4lib_find_max_char(u, u + len);
2359 if (max_char >= 0x10000)
2360 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002361 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002362 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002363 if (copy != NULL)
2364 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002365 Py_DECREF(unicode);
2366 *p_unicode = copy;
2367}
2368
Victor Stinner034f6cf2011-09-30 02:26:44 +02002369PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002370_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002371{
Victor Stinner87af4f22011-11-21 23:03:47 +01002372 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002373 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002374
Victor Stinner034f6cf2011-09-30 02:26:44 +02002375 if (!PyUnicode_Check(unicode)) {
2376 PyErr_BadInternalCall();
2377 return NULL;
2378 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002379 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002380 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002381
Victor Stinner87af4f22011-11-21 23:03:47 +01002382 length = PyUnicode_GET_LENGTH(unicode);
2383 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002384 if (!copy)
2385 return NULL;
2386 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2387
Christian Heimesf051e432016-09-13 20:22:02 +02002388 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002389 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002390 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002391 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002392}
2393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394
Victor Stinnerbc603d12011-10-02 01:00:40 +02002395/* Widen Unicode objects to larger buffers. Don't write terminating null
2396 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397
2398void*
2399_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2400{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002401 Py_ssize_t len;
2402 void *result;
2403 unsigned int skind;
2404
Benjamin Petersonbac79492012-01-14 13:34:47 -05002405 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002406 return NULL;
2407
2408 len = PyUnicode_GET_LENGTH(s);
2409 skind = PyUnicode_KIND(s);
2410 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002411 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return NULL;
2413 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002414 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002415 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002416 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002417 if (!result)
2418 return PyErr_NoMemory();
2419 assert(skind == PyUnicode_1BYTE_KIND);
2420 _PyUnicode_CONVERT_BYTES(
2421 Py_UCS1, Py_UCS2,
2422 PyUnicode_1BYTE_DATA(s),
2423 PyUnicode_1BYTE_DATA(s) + len,
2424 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002426 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002427 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002428 if (!result)
2429 return PyErr_NoMemory();
2430 if (skind == PyUnicode_2BYTE_KIND) {
2431 _PyUnicode_CONVERT_BYTES(
2432 Py_UCS2, Py_UCS4,
2433 PyUnicode_2BYTE_DATA(s),
2434 PyUnicode_2BYTE_DATA(s) + len,
2435 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002437 else {
2438 assert(skind == PyUnicode_1BYTE_KIND);
2439 _PyUnicode_CONVERT_BYTES(
2440 Py_UCS1, Py_UCS4,
2441 PyUnicode_1BYTE_DATA(s),
2442 PyUnicode_1BYTE_DATA(s) + len,
2443 result);
2444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002446 default:
2447 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 }
Victor Stinner01698042011-10-04 00:04:26 +02002449 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 return NULL;
2451}
2452
2453static Py_UCS4*
2454as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2455 int copy_null)
2456{
2457 int kind;
2458 void *data;
2459 Py_ssize_t len, targetlen;
2460 if (PyUnicode_READY(string) == -1)
2461 return NULL;
2462 kind = PyUnicode_KIND(string);
2463 data = PyUnicode_DATA(string);
2464 len = PyUnicode_GET_LENGTH(string);
2465 targetlen = len;
2466 if (copy_null)
2467 targetlen++;
2468 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002469 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002470 if (!target) {
2471 PyErr_NoMemory();
2472 return NULL;
2473 }
2474 }
2475 else {
2476 if (targetsize < targetlen) {
2477 PyErr_Format(PyExc_SystemError,
2478 "string is longer than the buffer");
2479 if (copy_null && 0 < targetsize)
2480 target[0] = 0;
2481 return NULL;
2482 }
2483 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002484 if (kind == PyUnicode_1BYTE_KIND) {
2485 Py_UCS1 *start = (Py_UCS1 *) data;
2486 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002488 else if (kind == PyUnicode_2BYTE_KIND) {
2489 Py_UCS2 *start = (Py_UCS2 *) data;
2490 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2491 }
2492 else {
2493 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002494 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 if (copy_null)
2497 target[len] = 0;
2498 return target;
2499}
2500
2501Py_UCS4*
2502PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2503 int copy_null)
2504{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002505 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 PyErr_BadInternalCall();
2507 return NULL;
2508 }
2509 return as_ucs4(string, target, targetsize, copy_null);
2510}
2511
2512Py_UCS4*
2513PyUnicode_AsUCS4Copy(PyObject *string)
2514{
2515 return as_ucs4(string, NULL, 0, 1);
2516}
2517
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2521#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002522
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002523static int
2524unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2525 Py_ssize_t width, Py_ssize_t precision)
2526{
2527 Py_ssize_t length, fill, arglen;
2528 Py_UCS4 maxchar;
2529
2530 if (PyUnicode_READY(str) == -1)
2531 return -1;
2532
2533 length = PyUnicode_GET_LENGTH(str);
2534 if ((precision == -1 || precision >= length)
2535 && width <= length)
2536 return _PyUnicodeWriter_WriteStr(writer, str);
2537
2538 if (precision != -1)
2539 length = Py_MIN(precision, length);
2540
2541 arglen = Py_MAX(length, width);
2542 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2543 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2544 else
2545 maxchar = writer->maxchar;
2546
2547 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2548 return -1;
2549
2550 if (width > length) {
2551 fill = width - length;
2552 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2553 return -1;
2554 writer->pos += fill;
2555 }
2556
2557 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2558 str, 0, length);
2559 writer->pos += length;
2560 return 0;
2561}
2562
2563static int
2564unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2565 Py_ssize_t width, Py_ssize_t precision)
2566{
2567 /* UTF-8 */
2568 Py_ssize_t length;
2569 PyObject *unicode;
2570 int res;
2571
2572 length = strlen(str);
2573 if (precision != -1)
2574 length = Py_MIN(length, precision);
2575 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2576 if (unicode == NULL)
2577 return -1;
2578
2579 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2580 Py_DECREF(unicode);
2581 return res;
2582}
2583
Victor Stinner96865452011-03-01 23:44:09 +00002584static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002585unicode_fromformat_arg(_PyUnicodeWriter *writer,
2586 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002587{
Victor Stinnere215d962012-10-06 23:03:36 +02002588 const char *p;
2589 Py_ssize_t len;
2590 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002591 Py_ssize_t width;
2592 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002593 int longflag;
2594 int longlongflag;
2595 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002596 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002597
2598 p = f;
2599 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002600 zeropad = 0;
2601 if (*f == '0') {
2602 zeropad = 1;
2603 f++;
2604 }
Victor Stinner96865452011-03-01 23:44:09 +00002605
2606 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002607 width = -1;
2608 if (Py_ISDIGIT((unsigned)*f)) {
2609 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002610 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002611 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002612 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002613 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002614 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002615 return NULL;
2616 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002617 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002618 f++;
2619 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002620 }
2621 precision = -1;
2622 if (*f == '.') {
2623 f++;
2624 if (Py_ISDIGIT((unsigned)*f)) {
2625 precision = (*f - '0');
2626 f++;
2627 while (Py_ISDIGIT((unsigned)*f)) {
2628 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2629 PyErr_SetString(PyExc_ValueError,
2630 "precision too big");
2631 return NULL;
2632 }
2633 precision = (precision * 10) + (*f - '0');
2634 f++;
2635 }
2636 }
Victor Stinner96865452011-03-01 23:44:09 +00002637 if (*f == '%') {
2638 /* "%.3%s" => f points to "3" */
2639 f--;
2640 }
2641 }
2642 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002643 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002644 f--;
2645 }
Victor Stinner96865452011-03-01 23:44:09 +00002646
2647 /* Handle %ld, %lu, %lld and %llu. */
2648 longflag = 0;
2649 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002650 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002651 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002652 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002653 longflag = 1;
2654 ++f;
2655 }
Victor Stinner96865452011-03-01 23:44:09 +00002656 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002657 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002658 longlongflag = 1;
2659 f += 2;
2660 }
Victor Stinner96865452011-03-01 23:44:09 +00002661 }
2662 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002663 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002664 size_tflag = 1;
2665 ++f;
2666 }
Victor Stinnere215d962012-10-06 23:03:36 +02002667
2668 if (f[1] == '\0')
2669 writer->overallocate = 0;
2670
2671 switch (*f) {
2672 case 'c':
2673 {
2674 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002675 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002676 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002677 "character argument not in range(0x110000)");
2678 return NULL;
2679 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002680 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002681 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002682 break;
2683 }
2684
2685 case 'i':
2686 case 'd':
2687 case 'u':
2688 case 'x':
2689 {
2690 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002691 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002692 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002693
2694 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002695 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002696 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002697 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002699 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002700 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002701 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002702 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002703 va_arg(*vargs, size_t));
2704 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002705 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002706 va_arg(*vargs, unsigned int));
2707 }
2708 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002709 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002710 }
2711 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002712 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002713 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002714 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002715 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002716 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002717 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002718 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002719 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002720 va_arg(*vargs, Py_ssize_t));
2721 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002722 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002723 va_arg(*vargs, int));
2724 }
2725 assert(len >= 0);
2726
Victor Stinnere215d962012-10-06 23:03:36 +02002727 if (precision < len)
2728 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002729
2730 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002731 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2732 return NULL;
2733
Victor Stinnere215d962012-10-06 23:03:36 +02002734 if (width > precision) {
2735 Py_UCS4 fillchar;
2736 fill = width - precision;
2737 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002738 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2739 return NULL;
2740 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002741 }
Victor Stinner15a11362012-10-06 23:48:20 +02002742 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002743 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002744 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2745 return NULL;
2746 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002747 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002748
Victor Stinner4a587072013-11-19 12:54:53 +01002749 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2750 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002751 break;
2752 }
2753
2754 case 'p':
2755 {
2756 char number[MAX_LONG_LONG_CHARS];
2757
2758 len = sprintf(number, "%p", va_arg(*vargs, void*));
2759 assert(len >= 0);
2760
2761 /* %p is ill-defined: ensure leading 0x. */
2762 if (number[1] == 'X')
2763 number[1] = 'x';
2764 else if (number[1] != 'x') {
2765 memmove(number + 2, number,
2766 strlen(number) + 1);
2767 number[0] = '0';
2768 number[1] = 'x';
2769 len += 2;
2770 }
2771
Victor Stinner4a587072013-11-19 12:54:53 +01002772 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002773 return NULL;
2774 break;
2775 }
2776
2777 case 's':
2778 {
2779 /* UTF-8 */
2780 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002781 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002782 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002783 break;
2784 }
2785
2786 case 'U':
2787 {
2788 PyObject *obj = va_arg(*vargs, PyObject *);
2789 assert(obj && _PyUnicode_CHECK(obj));
2790
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002791 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
2793 break;
2794 }
2795
2796 case 'V':
2797 {
2798 PyObject *obj = va_arg(*vargs, PyObject *);
2799 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002800 if (obj) {
2801 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002802 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002803 return NULL;
2804 }
2805 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002806 assert(str != NULL);
2807 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002808 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002809 }
2810 break;
2811 }
2812
2813 case 'S':
2814 {
2815 PyObject *obj = va_arg(*vargs, PyObject *);
2816 PyObject *str;
2817 assert(obj);
2818 str = PyObject_Str(obj);
2819 if (!str)
2820 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002822 Py_DECREF(str);
2823 return NULL;
2824 }
2825 Py_DECREF(str);
2826 break;
2827 }
2828
2829 case 'R':
2830 {
2831 PyObject *obj = va_arg(*vargs, PyObject *);
2832 PyObject *repr;
2833 assert(obj);
2834 repr = PyObject_Repr(obj);
2835 if (!repr)
2836 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002837 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002838 Py_DECREF(repr);
2839 return NULL;
2840 }
2841 Py_DECREF(repr);
2842 break;
2843 }
2844
2845 case 'A':
2846 {
2847 PyObject *obj = va_arg(*vargs, PyObject *);
2848 PyObject *ascii;
2849 assert(obj);
2850 ascii = PyObject_ASCII(obj);
2851 if (!ascii)
2852 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002853 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002854 Py_DECREF(ascii);
2855 return NULL;
2856 }
2857 Py_DECREF(ascii);
2858 break;
2859 }
2860
2861 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002862 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002863 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002864 break;
2865
2866 default:
2867 /* if we stumble upon an unknown formatting code, copy the rest
2868 of the format string to the output string. (we cannot just
2869 skip the code, since there's no way to know what's in the
2870 argument list) */
2871 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002872 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002873 return NULL;
2874 f = p+len;
2875 return f;
2876 }
2877
2878 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002879 return f;
2880}
2881
Walter Dörwaldd2034312007-05-18 16:29:38 +00002882PyObject *
2883PyUnicode_FromFormatV(const char *format, va_list vargs)
2884{
Victor Stinnere215d962012-10-06 23:03:36 +02002885 va_list vargs2;
2886 const char *f;
2887 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002888
Victor Stinner8f674cc2013-04-17 23:02:17 +02002889 _PyUnicodeWriter_Init(&writer);
2890 writer.min_length = strlen(format) + 100;
2891 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002892
Benjamin Peterson0c212142016-09-20 20:39:33 -07002893 // Copy varags to be able to pass a reference to a subfunction.
2894 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002895
2896 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002897 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002898 f = unicode_fromformat_arg(&writer, f, &vargs2);
2899 if (f == NULL)
2900 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002902 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002903 const char *p;
2904 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002905
Victor Stinnere215d962012-10-06 23:03:36 +02002906 p = f;
2907 do
2908 {
2909 if ((unsigned char)*p > 127) {
2910 PyErr_Format(PyExc_ValueError,
2911 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2912 "string, got a non-ASCII byte: 0x%02x",
2913 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002914 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002915 }
2916 p++;
2917 }
2918 while (*p != '\0' && *p != '%');
2919 len = p - f;
2920
2921 if (*p == '\0')
2922 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002923
2924 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002925 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002926
2927 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002929 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002930 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002931 return _PyUnicodeWriter_Finish(&writer);
2932
2933 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002934 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002935 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937}
2938
Walter Dörwaldd2034312007-05-18 16:29:38 +00002939PyObject *
2940PyUnicode_FromFormat(const char *format, ...)
2941{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002942 PyObject* ret;
2943 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002944
2945#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002946 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002947#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002948 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002949#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002950 ret = PyUnicode_FromFormatV(format, vargs);
2951 va_end(vargs);
2952 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002953}
2954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955#ifdef HAVE_WCHAR_H
2956
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002957/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002958
Victor Stinnerd88d9832011-09-06 02:00:05 +02002959 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 character) required to convert the unicode object. Ignore size argument.
2961
Victor Stinnerd88d9832011-09-06 02:00:05 +02002962 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002963 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002964 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002965Py_ssize_t
2966PyUnicode_AsWideChar(PyObject *unicode,
2967 wchar_t *w,
2968 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002969{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002970 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 const wchar_t *wstr;
2972
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002973 if (unicode == NULL) {
2974 PyErr_BadInternalCall();
2975 return -1;
2976 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002977 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002978 if (wstr == NULL)
2979 return -1;
2980
Victor Stinner5593d8a2010-10-02 11:11:27 +00002981 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002982 if (size > res)
2983 size = res + 1;
2984 else
2985 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002986 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002987 return res;
2988 }
2989 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002990 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002991}
2992
Victor Stinner137c34c2010-09-29 10:25:54 +00002993wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002994PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002995 Py_ssize_t *size)
2996{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002997 const wchar_t *wstr;
2998 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00002999 Py_ssize_t buflen;
3000
3001 if (unicode == NULL) {
3002 PyErr_BadInternalCall();
3003 return NULL;
3004 }
3005
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003006 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3007 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003008 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003009 }
3010 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3011 PyErr_SetString(PyExc_ValueError,
3012 "embedded null character");
3013 return NULL;
3014 }
3015
3016 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 if (buffer == NULL) {
3018 PyErr_NoMemory();
3019 return NULL;
3020 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003021 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003022 if (size != NULL)
3023 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003024 return buffer;
3025}
3026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003027#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
3030PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003031{
Victor Stinner8faf8212011-12-08 22:14:11 +01003032 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 PyErr_SetString(PyExc_ValueError,
3034 "chr() arg not in range(0x110000)");
3035 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003036 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003037
Victor Stinner985a82a2014-01-03 12:53:47 +01003038 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003039}
3040
Alexander Belopolsky40018472011-02-26 01:02:56 +00003041PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003042PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003046 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003047 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003048 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 Py_INCREF(obj);
3050 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003051 }
3052 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 /* For a Unicode subtype that's not a Unicode object,
3054 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003055 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003056 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003057 PyErr_Format(PyExc_TypeError,
3058 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003059 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003060 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003061}
3062
Alexander Belopolsky40018472011-02-26 01:02:56 +00003063PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003064PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003065 const char *encoding,
3066 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003067{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003068 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003069 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003070
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 PyErr_BadInternalCall();
3073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003075
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 /* Decoding bytes objects is the most common case and should be fast */
3077 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003078 if (PyBytes_GET_SIZE(obj) == 0)
3079 _Py_RETURN_UNICODE_EMPTY();
3080 v = PyUnicode_Decode(
3081 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3082 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003083 return v;
3084 }
3085
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003086 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003087 PyErr_SetString(PyExc_TypeError,
3088 "decoding str is not supported");
3089 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003090 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003091
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003092 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3093 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3094 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003095 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003096 Py_TYPE(obj)->tp_name);
3097 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003098 }
Tim Petersced69f82003-09-16 20:30:58 +00003099
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003100 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003101 PyBuffer_Release(&buffer);
3102 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003104
Serhiy Storchaka05997252013-01-26 12:14:02 +02003105 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003106 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003107 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108}
3109
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters between two copied characters becomes a single '_',
   and such runs at the start or the end of the name are dropped.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_end;
    int pending_sep;

    assert(encoding != NULL);

    dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    dst_end = &lower[lower_len - 1];
    pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember it, emit nothing yet. */
            pending_sep = 1;
            continue;
        }

        /* Emit the separator only between two copied characters. */
        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3158
Alexander Belopolsky40018472011-02-26 01:02:56 +00003159PyObject *
3160PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003161 Py_ssize_t size,
3162 const char *encoding,
3163 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003164{
3165 PyObject *buffer = NULL, *unicode;
3166 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003167 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3168
3169 if (encoding == NULL) {
3170 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3171 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003172
Fred Drakee4315f52000-05-09 19:53:39 +00003173 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003174 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3175 char *lower = buflower;
3176
3177 /* Fast paths */
3178 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3179 lower += 3;
3180 if (*lower == '_') {
3181 /* Match "utf8" and "utf_8" */
3182 lower++;
3183 }
3184
3185 if (lower[0] == '8' && lower[1] == 0) {
3186 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3187 }
3188 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3189 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3190 }
3191 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3192 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3193 }
3194 }
3195 else {
3196 if (strcmp(lower, "ascii") == 0
3197 || strcmp(lower, "us_ascii") == 0) {
3198 return PyUnicode_DecodeASCII(s, size, errors);
3199 }
Steve Dowercc16be82016-09-08 10:35:16 -07003200 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003201 else if (strcmp(lower, "mbcs") == 0) {
3202 return PyUnicode_DecodeMBCS(s, size, errors);
3203 }
3204 #endif
3205 else if (strcmp(lower, "latin1") == 0
3206 || strcmp(lower, "latin_1") == 0
3207 || strcmp(lower, "iso_8859_1") == 0
3208 || strcmp(lower, "iso8859_1") == 0) {
3209 return PyUnicode_DecodeLatin1(s, size, errors);
3210 }
3211 }
Victor Stinner37296e82010-06-10 13:36:23 +00003212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213
3214 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003215 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003216 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003217 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003218 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 if (buffer == NULL)
3220 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003221 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 if (unicode == NULL)
3223 goto onError;
3224 if (!PyUnicode_Check(unicode)) {
3225 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003226 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3227 "use codecs.decode() to decode to arbitrary types",
3228 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003229 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 Py_DECREF(unicode);
3231 goto onError;
3232 }
3233 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003234 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003235
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 Py_XDECREF(buffer);
3238 return NULL;
3239}
3240
Alexander Belopolsky40018472011-02-26 01:02:56 +00003241PyObject *
3242PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003243 const char *encoding,
3244 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003245{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003246 if (!PyUnicode_Check(unicode)) {
3247 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003248 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249 }
3250
Serhiy Storchaka00939072016-10-27 21:05:49 +03003251 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3252 "PyUnicode_AsDecodedObject() is deprecated; "
3253 "use PyCodec_Decode() to decode from str", 1) < 0)
3254 return NULL;
3255
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003256 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003258
3259 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003260 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003261}
3262
Alexander Belopolsky40018472011-02-26 01:02:56 +00003263PyObject *
3264PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003265 const char *encoding,
3266 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003267{
3268 PyObject *v;
3269
3270 if (!PyUnicode_Check(unicode)) {
3271 PyErr_BadArgument();
3272 goto onError;
3273 }
3274
Serhiy Storchaka00939072016-10-27 21:05:49 +03003275 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3276 "PyUnicode_AsDecodedUnicode() is deprecated; "
3277 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3278 return NULL;
3279
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003280 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003282
3283 /* Decode via the codec registry */
3284 v = PyCodec_Decode(unicode, encoding, errors);
3285 if (v == NULL)
3286 goto onError;
3287 if (!PyUnicode_Check(v)) {
3288 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003289 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3290 "use codecs.decode() to decode to arbitrary types",
3291 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003292 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003293 Py_DECREF(v);
3294 goto onError;
3295 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003296 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003297
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003299 return NULL;
3300}
3301
Alexander Belopolsky40018472011-02-26 01:02:56 +00003302PyObject *
3303PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003304 Py_ssize_t size,
3305 const char *encoding,
3306 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307{
3308 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003309
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003310 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3314 Py_DECREF(unicode);
3315 return v;
3316}
3317
Alexander Belopolsky40018472011-02-26 01:02:56 +00003318PyObject *
3319PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003320 const char *encoding,
3321 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003322{
3323 PyObject *v;
3324
3325 if (!PyUnicode_Check(unicode)) {
3326 PyErr_BadArgument();
3327 goto onError;
3328 }
3329
Serhiy Storchaka00939072016-10-27 21:05:49 +03003330 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3331 "PyUnicode_AsEncodedObject() is deprecated; "
3332 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3333 "or PyCodec_Encode() for generic encoding", 1) < 0)
3334 return NULL;
3335
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003336 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003338
3339 /* Encode via the codec registry */
3340 v = PyCodec_Encode(unicode, encoding, errors);
3341 if (v == NULL)
3342 goto onError;
3343 return v;
3344
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003346 return NULL;
3347}
3348
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one
   surrogate pair at a time when wchar_t is 16 bits wide).

   Return the index of that character in wstr, or 0 when every character
   converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *mark = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1) {
            return mark - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3395
Victor Stinner1b579672011-12-17 05:47:23 +01003396static int
3397locale_error_handler(const char *errors, int *surrogateescape)
3398{
Victor Stinner50149202015-09-22 00:26:54 +02003399 _Py_error_handler error_handler = get_error_handler(errors);
3400 switch (error_handler)
3401 {
3402 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003403 *surrogateescape = 0;
3404 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003405 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003406 *surrogateescape = 1;
3407 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003408 default:
3409 PyErr_Format(PyExc_ValueError,
3410 "only 'strict' and 'surrogateescape' error handlers "
3411 "are supported, not '%s'",
3412 errors);
3413 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003414 }
Victor Stinner1b579672011-12-17 05:47:23 +01003415}
3416
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003417PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003418PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419{
3420 Py_ssize_t wlen, wlen2;
3421 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003422 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003423 PyObject *bytes, *reason, *exc;
3424 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003425 int surrogateescape;
3426
3427 if (locale_error_handler(errors, &surrogateescape) < 0)
3428 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003429
3430 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3431 if (wstr == NULL)
3432 return NULL;
3433
3434 wlen2 = wcslen(wstr);
3435 if (wlen2 != wlen) {
3436 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003437 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003438 return NULL;
3439 }
3440
3441 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003442 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003443 char *str;
3444
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003445 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003446 if (str == NULL) {
3447 if (error_pos == (size_t)-1) {
3448 PyErr_NoMemory();
3449 PyMem_Free(wstr);
3450 return NULL;
3451 }
3452 else {
3453 goto encode_error;
3454 }
3455 }
3456 PyMem_Free(wstr);
3457
3458 bytes = PyBytes_FromString(str);
3459 PyMem_Free(str);
3460 }
3461 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003462 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003463 size_t len, len2;
3464
3465 len = wcstombs(NULL, wstr, 0);
3466 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003467 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003468 goto encode_error;
3469 }
3470
3471 bytes = PyBytes_FromStringAndSize(NULL, len);
3472 if (bytes == NULL) {
3473 PyMem_Free(wstr);
3474 return NULL;
3475 }
3476
3477 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3478 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003479 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003480 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003481 goto encode_error;
3482 }
3483 PyMem_Free(wstr);
3484 }
3485 return bytes;
3486
3487encode_error:
3488 errmsg = strerror(errno);
3489 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003490
3491 if (error_pos == (size_t)-1)
3492 error_pos = wcstombs_errorpos(wstr);
3493
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003494 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003495
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003496 wstr = Py_DecodeLocale(errmsg, &errlen);
3497 if (wstr != NULL) {
3498 reason = PyUnicode_FromWideChar(wstr, errlen);
3499 PyMem_RawFree(wstr);
3500 } else {
3501 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003502 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003503
Victor Stinner2f197072011-12-17 07:08:30 +01003504 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003505 reason = PyUnicode_FromString(
3506 "wcstombs() encountered an unencodable "
3507 "wide character");
3508 if (reason == NULL)
3509 return NULL;
3510
3511 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3512 "locale", unicode,
3513 (Py_ssize_t)error_pos,
3514 (Py_ssize_t)(error_pos+1),
3515 reason);
3516 Py_DECREF(reason);
3517 if (exc != NULL) {
3518 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003519 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003520 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003521 return NULL;
3522}
3523
Victor Stinnerad158722010-10-27 00:25:46 +00003524PyObject *
3525PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003526{
Steve Dowercc16be82016-09-08 10:35:16 -07003527#if defined(__APPLE__)
3528 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003529#else
Victor Stinner793b5312011-04-27 00:24:21 +02003530 PyInterpreterState *interp = PyThreadState_GET()->interp;
3531 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3532 cannot use it to encode and decode filenames before it is loaded. Load
3533 the Python codec requires to encode at least its own filename. Use the C
3534 version of the locale codec until the codec registry is initialized and
3535 the Python codec is loaded.
3536
3537 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3538 cannot only rely on it: check also interp->fscodec_initialized for
3539 subinterpreters. */
3540 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003541 return PyUnicode_AsEncodedString(unicode,
3542 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003543 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003544 }
3545 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003546 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003547 }
Victor Stinnerad158722010-10-27 00:25:46 +00003548#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003549}
3550
Alexander Belopolsky40018472011-02-26 01:02:56 +00003551PyObject *
3552PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003553 const char *encoding,
3554 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555{
3556 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003557 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003558
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 if (!PyUnicode_Check(unicode)) {
3560 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 }
Fred Drakee4315f52000-05-09 19:53:39 +00003563
Victor Stinner942889a2016-09-05 15:40:10 -07003564 if (encoding == NULL) {
3565 return _PyUnicode_AsUTF8String(unicode, errors);
3566 }
3567
Fred Drakee4315f52000-05-09 19:53:39 +00003568 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003569 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3570 char *lower = buflower;
3571
3572 /* Fast paths */
3573 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3574 lower += 3;
3575 if (*lower == '_') {
3576 /* Match "utf8" and "utf_8" */
3577 lower++;
3578 }
3579
3580 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003581 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003582 }
3583 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3584 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3585 }
3586 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3587 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3588 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003589 }
Victor Stinner942889a2016-09-05 15:40:10 -07003590 else {
3591 if (strcmp(lower, "ascii") == 0
3592 || strcmp(lower, "us_ascii") == 0) {
3593 return _PyUnicode_AsASCIIString(unicode, errors);
3594 }
Steve Dowercc16be82016-09-08 10:35:16 -07003595#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003596 else if (strcmp(lower, "mbcs") == 0) {
3597 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3598 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003599#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003600 else if (strcmp(lower, "latin1") == 0 ||
3601 strcmp(lower, "latin_1") == 0 ||
3602 strcmp(lower, "iso_8859_1") == 0 ||
3603 strcmp(lower, "iso8859_1") == 0) {
3604 return _PyUnicode_AsLatin1String(unicode, errors);
3605 }
3606 }
Victor Stinner37296e82010-06-10 13:36:23 +00003607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608
3609 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003610 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003612 return NULL;
3613
3614 /* The normal path */
3615 if (PyBytes_Check(v))
3616 return v;
3617
3618 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003619 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003620 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003621 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003622
3623 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003624 "encoder %s returned bytearray instead of bytes; "
3625 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003626 encoding);
3627 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003628 Py_DECREF(v);
3629 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003630 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003631
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003632 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3633 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003634 Py_DECREF(v);
3635 return b;
3636 }
3637
3638 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003639 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3640 "use codecs.encode() to encode to arbitrary types",
3641 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003642 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003643 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003644 return NULL;
3645}
3646
Alexander Belopolsky40018472011-02-26 01:02:56 +00003647PyObject *
3648PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003649 const char *encoding,
3650 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003651{
3652 PyObject *v;
3653
3654 if (!PyUnicode_Check(unicode)) {
3655 PyErr_BadArgument();
3656 goto onError;
3657 }
3658
Serhiy Storchaka00939072016-10-27 21:05:49 +03003659 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3660 "PyUnicode_AsEncodedUnicode() is deprecated; "
3661 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3662 return NULL;
3663
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003664 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003665 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003666
3667 /* Encode via the codec registry */
3668 v = PyCodec_Encode(unicode, encoding, errors);
3669 if (v == NULL)
3670 goto onError;
3671 if (!PyUnicode_Check(v)) {
3672 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003673 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3674 "use codecs.encode() to encode to arbitrary types",
3675 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003676 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003677 Py_DECREF(v);
3678 goto onError;
3679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003681
Benjamin Peterson29060642009-01-31 22:14:21 +00003682 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 return NULL;
3684}
3685
/* Locate the first byte sequence in str[0..len) that mbrtowc() cannot
   convert.

   Returns the byte offset of that sequence, or 0 when no failing sequence
   is found (an embedded terminator also stops the scan).  Always returns 0
   when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    /* Start from the initial conversion state. */
    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3716
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003717PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003718PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003719 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003720{
3721 wchar_t smallbuf[256];
3722 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3723 wchar_t *wstr;
3724 size_t wlen, wlen2;
3725 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003726 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003727 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003728 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003729 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003730
3731 if (locale_error_handler(errors, &surrogateescape) < 0)
3732 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003733
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003734 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3735 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003736 return NULL;
3737 }
3738
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003739 if (surrogateescape) {
3740 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003741 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003742 if (wstr == NULL) {
3743 if (wlen == (size_t)-1)
3744 PyErr_NoMemory();
3745 else
3746 PyErr_SetFromErrno(PyExc_OSError);
3747 return NULL;
3748 }
3749
3750 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003751 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003752 }
3753 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003754 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755#ifndef HAVE_BROKEN_MBSTOWCS
3756 wlen = mbstowcs(NULL, str, 0);
3757#else
3758 wlen = len;
3759#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003760 if (wlen == (size_t)-1)
3761 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003762 if (wlen+1 <= smallbuf_len) {
3763 wstr = smallbuf;
3764 }
3765 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003766 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003767 if (!wstr)
3768 return PyErr_NoMemory();
3769 }
3770
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003771 wlen2 = mbstowcs(wstr, str, wlen+1);
3772 if (wlen2 == (size_t)-1) {
3773 if (wstr != smallbuf)
3774 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003775 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003776 }
3777#ifdef HAVE_BROKEN_MBSTOWCS
3778 assert(wlen2 == wlen);
3779#endif
3780 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3781 if (wstr != smallbuf)
3782 PyMem_Free(wstr);
3783 }
3784 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003785
3786decode_error:
3787 errmsg = strerror(errno);
3788 assert(errmsg != NULL);
3789
3790 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003791 wstr = Py_DecodeLocale(errmsg, &errlen);
3792 if (wstr != NULL) {
3793 reason = PyUnicode_FromWideChar(wstr, errlen);
3794 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003795 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003796
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003797 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003798 reason = PyUnicode_FromString(
3799 "mbstowcs() encountered an invalid multibyte sequence");
3800 if (reason == NULL)
3801 return NULL;
3802
3803 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3804 "locale", str, len,
3805 (Py_ssize_t)error_pos,
3806 (Py_ssize_t)(error_pos+1),
3807 reason);
3808 Py_DECREF(reason);
3809 if (exc != NULL) {
3810 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003811 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003812 }
3813 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003814}
3815
3816PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003817PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003818{
3819 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003820 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003821}
3822
3823
3824PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003825PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003826 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003827 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3828}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003829
Christian Heimes5894ba72007-11-04 11:43:14 +00003830PyObject*
3831PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3832{
Steve Dowercc16be82016-09-08 10:35:16 -07003833#if defined(__APPLE__)
3834 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003835#else
Victor Stinner793b5312011-04-27 00:24:21 +02003836 PyInterpreterState *interp = PyThreadState_GET()->interp;
3837 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3838 cannot use it to encode and decode filenames before it is loaded. Load
3839 the Python codec requires to encode at least its own filename. Use the C
3840 version of the locale codec until the codec registry is initialized and
3841 the Python codec is loaded.
3842
3843 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3844 cannot only rely on it: check also interp->fscodec_initialized for
3845 subinterpreters. */
3846 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003847 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003848 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003849 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003850 }
3851 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003852 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003853 }
Victor Stinnerad158722010-10-27 00:25:46 +00003854#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003855}
3856
Martin v. Löwis011e8422009-05-05 04:43:17 +00003857
3858int
3859PyUnicode_FSConverter(PyObject* arg, void* addr)
3860{
Brett Cannonec6ce872016-09-06 15:50:29 -07003861 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003862 PyObject *output = NULL;
3863 Py_ssize_t size;
3864 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003865 if (arg == NULL) {
3866 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003867 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003868 return 1;
3869 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003870 path = PyOS_FSPath(arg);
3871 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003872 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003873 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003874 if (PyBytes_Check(path)) {
3875 output = path;
3876 }
3877 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3878 output = PyUnicode_EncodeFSDefault(path);
3879 Py_DECREF(path);
3880 if (!output) {
3881 return 0;
3882 }
3883 assert(PyBytes_Check(output));
3884 }
3885
Victor Stinner0ea2a462010-04-30 00:22:08 +00003886 size = PyBytes_GET_SIZE(output);
3887 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003888 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003889 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003890 Py_DECREF(output);
3891 return 0;
3892 }
3893 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003894 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003895}
3896
3897
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003898int
3899PyUnicode_FSDecoder(PyObject* arg, void* addr)
3900{
Brett Cannona5711202016-09-06 19:36:01 -07003901 int is_buffer = 0;
3902 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003903 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003904 if (arg == NULL) {
3905 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003906 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003907 return 1;
3908 }
Brett Cannona5711202016-09-06 19:36:01 -07003909
3910 is_buffer = PyObject_CheckBuffer(arg);
3911 if (!is_buffer) {
3912 path = PyOS_FSPath(arg);
3913 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003914 return 0;
3915 }
Brett Cannona5711202016-09-06 19:36:01 -07003916 }
3917 else {
3918 path = arg;
3919 Py_INCREF(arg);
3920 }
3921
3922 if (PyUnicode_Check(path)) {
3923 if (PyUnicode_READY(path) == -1) {
3924 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003925 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003926 }
3927 output = path;
3928 }
3929 else if (PyBytes_Check(path) || is_buffer) {
3930 PyObject *path_bytes = NULL;
3931
3932 if (!PyBytes_Check(path) &&
3933 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3934 "path should be string, bytes, or os.PathLike, not %.200s",
3935 Py_TYPE(arg)->tp_name)) {
3936 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003937 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003938 }
3939 path_bytes = PyBytes_FromObject(path);
3940 Py_DECREF(path);
3941 if (!path_bytes) {
3942 return 0;
3943 }
3944 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3945 PyBytes_GET_SIZE(path_bytes));
3946 Py_DECREF(path_bytes);
3947 if (!output) {
3948 return 0;
3949 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003950 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003951 else {
3952 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003953 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003954 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003955 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003956 return 0;
3957 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003958 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003959 Py_DECREF(output);
3960 return 0;
3961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003963 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003964 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003965 Py_DECREF(output);
3966 return 0;
3967 }
3968 *(PyObject**)addr = output;
3969 return Py_CLEANUP_SUPPORTED;
3970}
3971
3972
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003973const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003975{
Christian Heimesf3863112007-11-22 07:46:41 +00003976 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003978 if (!PyUnicode_Check(unicode)) {
3979 PyErr_BadArgument();
3980 return NULL;
3981 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003982 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003983 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003985 if (PyUnicode_UTF8(unicode) == NULL) {
3986 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003987 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988 if (bytes == NULL)
3989 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003990 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3991 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003992 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993 Py_DECREF(bytes);
3994 return NULL;
3995 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003996 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003997 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003998 PyBytes_AS_STRING(bytes),
3999 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 Py_DECREF(bytes);
4001 }
4002
4003 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004004 *psize = PyUnicode_UTF8_LENGTH(unicode);
4005 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004006}
4007
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004008const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4012}
4013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014Py_UNICODE *
4015PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 const unsigned char *one_byte;
4018#if SIZEOF_WCHAR_T == 4
4019 const Py_UCS2 *two_bytes;
4020#else
4021 const Py_UCS4 *four_bytes;
4022 const Py_UCS4 *ucs4_end;
4023 Py_ssize_t num_surrogates;
4024#endif
4025 wchar_t *w;
4026 wchar_t *wchar_end;
4027
4028 if (!PyUnicode_Check(unicode)) {
4029 PyErr_BadArgument();
4030 return NULL;
4031 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004032 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004034 assert(_PyUnicode_KIND(unicode) != 0);
4035 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004037 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004039 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4040 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 num_surrogates = 0;
4042
4043 for (; four_bytes < ucs4_end; ++four_bytes) {
4044 if (*four_bytes > 0xFFFF)
4045 ++num_surrogates;
4046 }
4047
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004048 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4049 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4050 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 PyErr_NoMemory();
4052 return NULL;
4053 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004054 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004056 w = _PyUnicode_WSTR(unicode);
4057 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4058 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4060 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004061 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004063 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4064 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 }
4066 else
4067 *w = *four_bytes;
4068
4069 if (w > wchar_end) {
4070 assert(0 && "Miscalculated string end");
4071 }
4072 }
4073 *w = 0;
4074#else
4075 /* sizeof(wchar_t) == 4 */
4076 Py_FatalError("Impossible unicode object state, wstr and str "
4077 "should share memory already.");
4078 return NULL;
4079#endif
4080 }
4081 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004082 if ((size_t)_PyUnicode_LENGTH(unicode) >
4083 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4084 PyErr_NoMemory();
4085 return NULL;
4086 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004087 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4088 (_PyUnicode_LENGTH(unicode) + 1));
4089 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004090 PyErr_NoMemory();
4091 return NULL;
4092 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004093 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4094 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4095 w = _PyUnicode_WSTR(unicode);
4096 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004098 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4099 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100 for (; w < wchar_end; ++one_byte, ++w)
4101 *w = *one_byte;
4102 /* null-terminate the wstr */
4103 *w = 0;
4104 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004105 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004106#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004107 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108 for (; w < wchar_end; ++two_bytes, ++w)
4109 *w = *two_bytes;
4110 /* null-terminate the wstr */
4111 *w = 0;
4112#else
4113 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004114 PyObject_FREE(_PyUnicode_WSTR(unicode));
4115 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 Py_FatalError("Impossible unicode object state, wstr "
4117 "and str should share memory already.");
4118 return NULL;
4119#endif
4120 }
4121 else {
4122 assert(0 && "This should never happen.");
4123 }
4124 }
4125 }
4126 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004127 *size = PyUnicode_WSTR_LENGTH(unicode);
4128 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004129}
4130
Alexander Belopolsky40018472011-02-26 01:02:56 +00004131Py_UNICODE *
4132PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135}
4136
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004137const Py_UNICODE *
4138_PyUnicode_AsUnicode(PyObject *unicode)
4139{
4140 Py_ssize_t size;
4141 const Py_UNICODE *wstr;
4142
4143 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4144 if (wstr && wcslen(wstr) != (size_t)size) {
4145 PyErr_SetString(PyExc_ValueError, "embedded null character");
4146 return NULL;
4147 }
4148 return wstr;
4149}
4150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151
Alexander Belopolsky40018472011-02-26 01:02:56 +00004152Py_ssize_t
4153PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154{
4155 if (!PyUnicode_Check(unicode)) {
4156 PyErr_BadArgument();
4157 goto onError;
4158 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004159 if (_PyUnicode_WSTR(unicode) == NULL) {
4160 if (PyUnicode_AsUnicode(unicode) == NULL)
4161 goto onError;
4162 }
4163 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164
Benjamin Peterson29060642009-01-31 22:14:21 +00004165 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 return -1;
4167}
4168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004169Py_ssize_t
4170PyUnicode_GetLength(PyObject *unicode)
4171{
Victor Stinner07621332012-06-16 04:53:46 +02004172 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004173 PyErr_BadArgument();
4174 return -1;
4175 }
Victor Stinner07621332012-06-16 04:53:46 +02004176 if (PyUnicode_READY(unicode) == -1)
4177 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 return PyUnicode_GET_LENGTH(unicode);
4179}
4180
4181Py_UCS4
4182PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4183{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004184 void *data;
4185 int kind;
4186
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004187 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4188 PyErr_BadArgument();
4189 return (Py_UCS4)-1;
4190 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004191 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004192 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193 return (Py_UCS4)-1;
4194 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004195 data = PyUnicode_DATA(unicode);
4196 kind = PyUnicode_KIND(unicode);
4197 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004198}
4199
4200int
4201PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4202{
4203 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004204 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 return -1;
4206 }
Victor Stinner488fa492011-12-12 00:01:39 +01004207 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004208 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004209 PyErr_SetString(PyExc_IndexError, "string index out of range");
4210 return -1;
4211 }
Victor Stinner488fa492011-12-12 00:01:39 +01004212 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004213 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004214 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4215 PyErr_SetString(PyExc_ValueError, "character out of range");
4216 return -1;
4217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4219 index, ch);
4220 return 0;
4221}
4222
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4228
Victor Stinner554f3f02010-06-16 23:33:54 +00004229/* create or adjust a UnicodeDecodeError */
4230static void
4231make_decode_exception(PyObject **exceptionObject,
4232 const char *encoding,
4233 const char *input, Py_ssize_t length,
4234 Py_ssize_t startpos, Py_ssize_t endpos,
4235 const char *reason)
4236{
4237 if (*exceptionObject == NULL) {
4238 *exceptionObject = PyUnicodeDecodeError_Create(
4239 encoding, input, length, startpos, endpos, reason);
4240 }
4241 else {
4242 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4243 goto onError;
4244 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4245 goto onError;
4246 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4247 goto onError;
4248 }
4249 return;
4250
4251onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004252 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004253}
4254
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   Parameters:
     errors          name handed to PyCodec_LookupError() when
                     *errorHandler is still NULL
     errorHandler    in/out: cached error handler callable
     encoding,
     reason          strings stored in the UnicodeDecodeError
     input, inend    in/out: start and end of the input bytes; both are
                     re-read from the exception after the callback ran,
                     because the callback may have replaced the object
     startinpos,
     endinpos        offsets of the offending bytes; on success
                     *endinpos is set to the position returned by the
                     callback
     exceptionObject in/out: cached UnicodeDecodeError instance
     inptr           out: set to *input + the returned position
     output          in/out: wchar-kind string receiving the
                     replacement; resized when it is too small
     outpos          in/out: write offset in *output, advanced by the
                     replacement length
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple format; the text after ';' is the
       error message, which is also reused verbatim via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is enough and cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  wcsncpy() must not be used
       here: it stops at the first NUL in the source and zero-fills the
       remainder, which would silently drop everything that follows an
       embedded NUL in the replacement string. */
    wmemcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, (size_t)repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367
4368static int
4369unicode_decode_call_errorhandler_writer(
4370 const char *errors, PyObject **errorHandler,
4371 const char *encoding, const char *reason,
4372 const char **input, const char **inend, Py_ssize_t *startinpos,
4373 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4374 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4375{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004376 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004377
4378 PyObject *restuple = NULL;
4379 PyObject *repunicode = NULL;
4380 Py_ssize_t insize;
4381 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004382 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004383 PyObject *inputobj = NULL;
4384
4385 if (*errorHandler == NULL) {
4386 *errorHandler = PyCodec_LookupError(errors);
4387 if (*errorHandler == NULL)
4388 goto onError;
4389 }
4390
4391 make_decode_exception(exceptionObject,
4392 encoding,
4393 *input, *inend - *input,
4394 *startinpos, *endinpos,
4395 reason);
4396 if (*exceptionObject == NULL)
4397 goto onError;
4398
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004399 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004400 if (restuple == NULL)
4401 goto onError;
4402 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004403 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004404 goto onError;
4405 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004406 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408
4409 /* Copy back the bytes variables, which might have been modified by the
4410 callback */
4411 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4412 if (!inputobj)
4413 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004414 *input = PyBytes_AS_STRING(inputobj);
4415 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004416 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004417 /* we can DECREF safely, as the exception has another reference,
4418 so the object won't go away. */
4419 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004423 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004424 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427
Victor Stinner170ca6f2013-04-18 00:25:28 +02004428 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004429 if (replen > 1) {
4430 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004431 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004432 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4433 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4434 goto onError;
4435 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004437 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004440 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004443 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449}
4450
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451/* --- UTF-7 Codec -------------------------------------------------------- */
4452
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453/* See RFC2152 for details. We encode conservatively and decode liberally. */
4454
/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that a
 * compound flag expression (e.g. a || b) is evaluated as a unit
 * instead of being split by the surrounding && operator. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531
Alexander Belopolsky40018472011-02-26 01:02:56 +00004532PyObject *
4533PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004534 Py_ssize_t size,
4535 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004537 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4538}
4539
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540/* The decoder. The only state we preserve is our read position,
4541 * i.e. how many characters we have consumed. So if we end in the
4542 * middle of a shift sequence we have to back off the read position
4543 * and the output to the beginning of the sequence, otherwise we lose
4544 * all the shift state (seen bits, number of bits seen, high
4545 * surrogate). */
4546
Alexander Belopolsky40018472011-02-26 01:02:56 +00004547PyObject *
4548PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004549 Py_ssize_t size,
4550 const char *errors,
4551 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004552{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 Py_ssize_t startinpos;
4555 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004557 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004558 const char *errmsg = "";
4559 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004560 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 unsigned int base64bits = 0;
4562 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004563 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 PyObject *errorHandler = NULL;
4565 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004567 if (size == 0) {
4568 if (consumed)
4569 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004570 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004571 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004572
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004574 _PyUnicodeWriter_Init(&writer);
4575 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004576
4577 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 e = s + size;
4579
4580 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004581 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004583 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 if (inShift) { /* in a base-64 section */
4586 if (IS_BASE64(ch)) { /* consume a base-64 character */
4587 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4588 base64bits += 6;
4589 s++;
4590 if (base64bits >= 16) {
4591 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004592 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 base64bits -= 16;
4594 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004595 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 if (surrogate) {
4597 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004598 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4599 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004600 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004601 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004602 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004603 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 }
4605 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004606 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004607 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 }
4610 }
Victor Stinner551ac952011-11-29 22:58:13 +01004611 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 /* first surrogate */
4613 surrogate = outCh;
4614 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004616 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004617 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 }
4619 }
4620 }
4621 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004622 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 if (base64bits > 0) { /* left-over bits */
4624 if (base64bits >= 6) {
4625 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004626 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 errmsg = "partial character in shift sequence";
4628 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004629 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 else {
4631 /* Some bits remain; they should be zero */
4632 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004633 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 errmsg = "non-zero padding bits in shift sequence";
4635 goto utf7Error;
4636 }
4637 }
4638 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004639 if (surrogate && DECODE_DIRECT(ch)) {
4640 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4641 goto onError;
4642 }
4643 surrogate = 0;
4644 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 /* '-' is absorbed; other terminating
4646 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004647 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 }
4650 }
4651 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 s++; /* consume '+' */
4654 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004656 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004657 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 }
4659 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004660 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004661 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004662 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004664 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 }
4666 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004669 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004671 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004672 else {
4673 startinpos = s-starts;
4674 s++;
4675 errmsg = "unexpected special character";
4676 goto utf7Error;
4677 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004678 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004681 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 errors, &errorHandler,
4683 "utf7", errmsg,
4684 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004685 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687 }
4688
Antoine Pitrou244651a2009-05-04 18:56:13 +00004689 /* end of string */
4690
4691 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4692 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004693 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694 if (surrogate ||
4695 (base64bits >= 6) ||
4696 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004698 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 errors, &errorHandler,
4700 "utf7", "unterminated shift sequence",
4701 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004702 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 goto onError;
4704 if (s < e)
4705 goto restart;
4706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708
4709 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004710 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004711 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004712 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004713 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004714 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004715 writer.kind, writer.data, shiftOutStart);
4716 Py_XDECREF(errorHandler);
4717 Py_XDECREF(exc);
4718 _PyUnicodeWriter_Dealloc(&writer);
4719 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004720 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004721 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004722 }
4723 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004724 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004725 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004726 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 Py_XDECREF(errorHandler);
4729 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004730 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_XDECREF(errorHandler);
4734 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004735 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736 return NULL;
4737}
4738
4739
Alexander Belopolsky40018472011-02-26 01:02:56 +00004740PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004741_PyUnicode_EncodeUTF7(PyObject *str,
4742 int base64SetO,
4743 int base64WhiteSpace,
4744 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004746 int kind;
4747 void *data;
4748 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004749 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004750 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004751 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 unsigned int base64bits = 0;
4753 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754 char * out;
4755 char * start;
4756
Benjamin Petersonbac79492012-01-14 13:34:47 -05004757 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004758 return NULL;
4759 kind = PyUnicode_KIND(str);
4760 data = PyUnicode_DATA(str);
4761 len = PyUnicode_GET_LENGTH(str);
4762
4763 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004766 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004767 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004768 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004769 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770 if (v == NULL)
4771 return NULL;
4772
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004773 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004774 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004775 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004776
Antoine Pitrou244651a2009-05-04 18:56:13 +00004777 if (inShift) {
4778 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4779 /* shifting out */
4780 if (base64bits) { /* output remaining bits */
4781 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4782 base64buffer = 0;
4783 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004784 }
4785 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 /* Characters not in the BASE64 set implicitly unshift the sequence
4787 so no '-' is required, except if the character is itself a '-' */
4788 if (IS_BASE64(ch) || ch == '-') {
4789 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004790 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 *out++ = (char) ch;
4792 }
4793 else {
4794 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004795 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004796 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 else { /* not in a shift sequence */
4798 if (ch == '+') {
4799 *out++ = '+';
4800 *out++ = '-';
4801 }
4802 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4803 *out++ = (char) ch;
4804 }
4805 else {
4806 *out++ = '+';
4807 inShift = 1;
4808 goto encode_char;
4809 }
4810 }
4811 continue;
4812encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004814 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004815
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 /* code first surrogate */
4817 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004818 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004819 while (base64bits >= 6) {
4820 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4821 base64bits -= 6;
4822 }
4823 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004824 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004825 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004826 base64bits += 16;
4827 base64buffer = (base64buffer << 16) | ch;
4828 while (base64bits >= 6) {
4829 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4830 base64bits -= 6;
4831 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004832 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004833 if (base64bits)
4834 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4835 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004836 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004837 if (_PyBytes_Resize(&v, out - start) < 0)
4838 return NULL;
4839 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004841PyObject *
4842PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4843 Py_ssize_t size,
4844 int base64SetO,
4845 int base64WhiteSpace,
4846 const char *errors)
4847{
4848 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004849 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004850 if (tmp == NULL)
4851 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004852 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004853 base64WhiteSpace, errors);
4854 Py_DECREF(tmp);
4855 return result;
4856}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004857
Antoine Pitrou244651a2009-05-04 18:56:13 +00004858#undef IS_BASE64
4859#undef FROM_BASE64
4860#undef TO_BASE64
4861#undef DECODE_DIRECT
4862#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004863
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864/* --- UTF-8 Codec -------------------------------------------------------- */
4865
Alexander Belopolsky40018472011-02-26 01:02:56 +00004866PyObject *
4867PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004868 Py_ssize_t size,
4869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Walter Dörwald69652032004-09-07 20:24:22 +00004871 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4872}
4873
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004874#include "stringlib/asciilib.h"
4875#include "stringlib/codecs.h"
4876#include "stringlib/undef.h"
4877
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004878#include "stringlib/ucs1lib.h"
4879#include "stringlib/codecs.h"
4880#include "stringlib/undef.h"
4881
4882#include "stringlib/ucs2lib.h"
4883#include "stringlib/codecs.h"
4884#include "stringlib/undef.h"
4885
4886#include "stringlib/ucs4lib.h"
4887#include "stringlib/codecs.h"
4888#include "stringlib/undef.h"
4889
Antoine Pitrouab868312009-01-10 15:40:25 +00004890/* Mask to quickly check whether a C 'long' contains a
4891 non-ASCII, UTF8-encoded char. */
4892#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004893# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004894#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004895# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004896#else
4897# error C 'long' size should be either 4 or 8!
4898#endif
4899
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004900static Py_ssize_t
4901ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004904 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004906 /*
4907 * Issue #17237: m68k is a bit different from most architectures in
4908 * that objects do not use "natural alignment" - for example, int and
4909 * long are only aligned at 2-byte boundaries. Therefore the assert()
4910 * won't work; also, tests have shown that skipping the "optimised
4911 * version" will even speed up m68k.
4912 */
4913#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004915 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4916 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 /* Fast path, see in STRINGLIB(utf8_decode) for
4918 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004919 /* Help allocation */
4920 const char *_p = p;
4921 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 while (_p < aligned_end) {
4923 unsigned long value = *(const unsigned long *) _p;
4924 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 *((unsigned long *)q) = value;
4927 _p += SIZEOF_LONG;
4928 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004929 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 p = _p;
4931 while (p < end) {
4932 if ((unsigned char)*p & 0x80)
4933 break;
4934 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004939#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 while (p < end) {
4941 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4942 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004943 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004944 /* Help allocation */
4945 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946 while (_p < aligned_end) {
4947 unsigned long value = *(unsigned long *) _p;
4948 if (value & ASCII_CHAR_MASK)
4949 break;
4950 _p += SIZEOF_LONG;
4951 }
4952 p = _p;
4953 if (_p == end)
4954 break;
4955 }
4956 if ((unsigned char)*p & 0x80)
4957 break;
4958 ++p;
4959 }
4960 memcpy(dest, start, p - start);
4961 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962}
Antoine Pitrouab868312009-01-10 15:40:25 +00004963
Victor Stinner785938e2011-12-11 20:09:03 +01004964PyObject *
4965PyUnicode_DecodeUTF8Stateful(const char *s,
4966 Py_ssize_t size,
4967 const char *errors,
4968 Py_ssize_t *consumed)
4969{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004970 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004971 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973
4974 Py_ssize_t startinpos;
4975 Py_ssize_t endinpos;
4976 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004977 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004979 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004980
4981 if (size == 0) {
4982 if (consumed)
4983 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004984 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004985 }
4986
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4988 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004989 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 *consumed = 1;
4991 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004992 }
4993
Victor Stinner8f674cc2013-04-17 23:02:17 +02004994 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004995 writer.min_length = size;
4996 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004998
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 writer.pos = ascii_decode(s, end, writer.data);
5000 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 while (s < end) {
5002 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005004
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 if (PyUnicode_IS_ASCII(writer.buffer))
5007 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 } else {
5013 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005014 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 }
5016
5017 switch (ch) {
5018 case 0:
5019 if (s == end || consumed)
5020 goto End;
5021 errmsg = "unexpected end of data";
5022 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005023 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 break;
5025 case 1:
5026 errmsg = "invalid start byte";
5027 startinpos = s - starts;
5028 endinpos = startinpos + 1;
5029 break;
5030 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005031 case 3:
5032 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 errmsg = "invalid continuation byte";
5034 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005035 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036 break;
5037 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005038 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 goto onError;
5040 continue;
5041 }
5042
Victor Stinner1d65d912015-10-05 13:43:50 +02005043 if (error_handler == _Py_ERROR_UNKNOWN)
5044 error_handler = get_error_handler(errors);
5045
5046 switch (error_handler) {
5047 case _Py_ERROR_IGNORE:
5048 s += (endinpos - startinpos);
5049 break;
5050
5051 case _Py_ERROR_REPLACE:
5052 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5053 goto onError;
5054 s += (endinpos - startinpos);
5055 break;
5056
5057 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005058 {
5059 Py_ssize_t i;
5060
Victor Stinner1d65d912015-10-05 13:43:50 +02005061 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5062 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005063 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005064 ch = (Py_UCS4)(unsigned char)(starts[i]);
5065 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5066 ch + 0xdc00);
5067 writer.pos++;
5068 }
5069 s += (endinpos - startinpos);
5070 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005071 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005072
5073 default:
5074 if (unicode_decode_call_errorhandler_writer(
5075 errors, &error_handler_obj,
5076 "utf-8", errmsg,
5077 &starts, &end, &startinpos, &endinpos, &exc, &s,
5078 &writer))
5079 goto onError;
5080 }
Victor Stinner785938e2011-12-11 20:09:03 +01005081 }
5082
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 if (consumed)
5085 *consumed = s - starts;
5086
Victor Stinner1d65d912015-10-05 13:43:50 +02005087 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005088 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005089 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090
5091onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005092 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005093 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005094 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005095 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005096}
5097
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X and Android.

   Every byte that cannot be decoded is emitted as 0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL if the
   required buffer size would overflow or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: the result never needs more wchar_t units than the input
       has bytes, so size + 1 units (including the terminator) suffice.
       The bound is checked without computing size + 1, which would
       itself overflow for size == PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* The 4-byte decoder is not expected to hand back a code
               point. */
            assert(0);
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch == 0 with all input consumed means we are done. */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005154/* Primary internal function which creates utf8 encoded bytes objects.
5155
5156 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005157 and allocate exactly as much space needed at the end. Else allocate the
5158 maximum possible needed (4 result bytes per Unicode character), and return
5159 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005160*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005161PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005162_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163{
Victor Stinner6099a032011-12-18 14:22:26 +01005164 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 void *data;
5166 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168 if (!PyUnicode_Check(unicode)) {
5169 PyErr_BadArgument();
5170 return NULL;
5171 }
5172
5173 if (PyUnicode_READY(unicode) == -1)
5174 return NULL;
5175
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005176 if (PyUnicode_UTF8(unicode))
5177 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5178 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005179
5180 kind = PyUnicode_KIND(unicode);
5181 data = PyUnicode_DATA(unicode);
5182 size = PyUnicode_GET_LENGTH(unicode);
5183
Benjamin Petersonead6b532011-12-20 17:23:42 -06005184 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005185 default:
5186 assert(0);
5187 case PyUnicode_1BYTE_KIND:
5188 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5189 assert(!PyUnicode_IS_ASCII(unicode));
5190 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5191 case PyUnicode_2BYTE_KIND:
5192 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5193 case PyUnicode_4BYTE_KIND:
5194 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196}
5197
Alexander Belopolsky40018472011-02-26 01:02:56 +00005198PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005199PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5200 Py_ssize_t size,
5201 const char *errors)
5202{
5203 PyObject *v, *unicode;
5204
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005205 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005206 if (unicode == NULL)
5207 return NULL;
5208 v = _PyUnicode_AsUTF8String(unicode, errors);
5209 Py_DECREF(unicode);
5210 return v;
5211}
5212
5213PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005214PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005216 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217}
5218
/* --- UTF-32 Codec ------------------------------------------------------- */
5220
5221PyObject *
5222PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 Py_ssize_t size,
5224 const char *errors,
5225 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226{
5227 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5228}
5229
5230PyObject *
5231PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t size,
5233 const char *errors,
5234 int *byteorder,
5235 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236{
5237 const char *starts = s;
5238 Py_ssize_t startinpos;
5239 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005240 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005241 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005242 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005243 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245 PyObject *errorHandler = NULL;
5246 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005247
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248 q = (unsigned char *)s;
5249 e = q + size;
5250
5251 if (byteorder)
5252 bo = *byteorder;
5253
5254 /* Check for BOM marks (U+FEFF) in the input and adjust current
5255 byte order setting accordingly. In native mode, the leading BOM
5256 mark is skipped, in all other modes, it is copied to the output
5257 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005258 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005259 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005260 if (bom == 0x0000FEFF) {
5261 bo = -1;
5262 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005264 else if (bom == 0xFFFE0000) {
5265 bo = 1;
5266 q += 4;
5267 }
5268 if (byteorder)
5269 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005270 }
5271
Victor Stinnere64322e2012-10-30 23:12:47 +01005272 if (q == e) {
5273 if (consumed)
5274 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005275 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276 }
5277
Victor Stinnere64322e2012-10-30 23:12:47 +01005278#ifdef WORDS_BIGENDIAN
5279 le = bo < 0;
5280#else
5281 le = bo <= 0;
5282#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005283 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005284
Victor Stinner8f674cc2013-04-17 23:02:17 +02005285 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005286 writer.min_length = (e - q + 3) / 4;
5287 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005288 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005289
Victor Stinnere64322e2012-10-30 23:12:47 +01005290 while (1) {
5291 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005292 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005293
Victor Stinnere64322e2012-10-30 23:12:47 +01005294 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 enum PyUnicode_Kind kind = writer.kind;
5296 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005298 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005299 if (le) {
5300 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005301 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 if (ch > maxch)
5303 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 if (kind != PyUnicode_1BYTE_KIND &&
5305 Py_UNICODE_IS_SURROGATE(ch))
5306 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 q += 4;
5309 } while (q <= last);
5310 }
5311 else {
5312 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005313 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005314 if (ch > maxch)
5315 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005316 if (kind != PyUnicode_1BYTE_KIND &&
5317 Py_UNICODE_IS_SURROGATE(ch))
5318 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005319 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 q += 4;
5321 } while (q <= last);
5322 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005323 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 }
5325
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005326 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005327 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005328 startinpos = ((const char *)q) - starts;
5329 endinpos = startinpos + 4;
5330 }
5331 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005334 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 startinpos = ((const char *)q) - starts;
5337 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005339 else {
5340 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005341 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005342 goto onError;
5343 q += 4;
5344 continue;
5345 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005346 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005347 startinpos = ((const char *)q) - starts;
5348 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005350
5351 /* The remaining input chars are ignored if the callback
5352 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005353 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005355 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005359 }
5360
Walter Dörwald41980ca2007-08-16 21:55:45 +00005361 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364 Py_XDECREF(errorHandler);
5365 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005366 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005369 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370 Py_XDECREF(errorHandler);
5371 Py_XDECREF(exc);
5372 return NULL;
5373}
5374
5375PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005376_PyUnicode_EncodeUTF32(PyObject *str,
5377 const char *errors,
5378 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005379{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005380 enum PyUnicode_Kind kind;
5381 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005382 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005383 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005384 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005385#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005386 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005387#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005389#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005390 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005392 PyObject *errorHandler = NULL;
5393 PyObject *exc = NULL;
5394 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396 if (!PyUnicode_Check(str)) {
5397 PyErr_BadArgument();
5398 return NULL;
5399 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005400 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005401 return NULL;
5402 kind = PyUnicode_KIND(str);
5403 data = PyUnicode_DATA(str);
5404 len = PyUnicode_GET_LENGTH(str);
5405
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005406 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005407 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005408 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005409 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005410 if (v == NULL)
5411 return NULL;
5412
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005413 /* output buffer is 4-bytes aligned */
5414 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005415 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005416 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005418 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005419 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005420
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005422 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005423 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005425 else
5426 encoding = "utf-32";
5427
5428 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5430 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431 }
5432
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005433 pos = 0;
5434 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005435 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005436
5437 if (kind == PyUnicode_2BYTE_KIND) {
5438 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5439 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005440 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005441 else {
5442 assert(kind == PyUnicode_4BYTE_KIND);
5443 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5444 &out, native_ordering);
5445 }
5446 if (pos == len)
5447 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005448
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 rep = unicode_encode_call_errorhandler(
5450 errors, &errorHandler,
5451 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005452 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005453 if (!rep)
5454 goto error;
5455
5456 if (PyBytes_Check(rep)) {
5457 repsize = PyBytes_GET_SIZE(rep);
5458 if (repsize & 3) {
5459 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005460 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005461 "surrogates not allowed");
5462 goto error;
5463 }
5464 moreunits = repsize / 4;
5465 }
5466 else {
5467 assert(PyUnicode_Check(rep));
5468 if (PyUnicode_READY(rep) < 0)
5469 goto error;
5470 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5471 if (!PyUnicode_IS_ASCII(rep)) {
5472 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005473 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005474 "surrogates not allowed");
5475 goto error;
5476 }
5477 }
5478
5479 /* four bytes are reserved for each surrogate */
5480 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005481 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005482 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005483 /* integer overflow */
5484 PyErr_NoMemory();
5485 goto error;
5486 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005487 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005488 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005489 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005490 }
5491
5492 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005493 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005494 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005497 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5498 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 }
5500
5501 Py_CLEAR(rep);
5502 }
5503
5504 /* Cut back to size actually needed. This is necessary for, for example,
5505 encoding of a string containing isolated surrogates and the 'ignore'
5506 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005507 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005508 if (nsize != PyBytes_GET_SIZE(v))
5509 _PyBytes_Resize(&v, nsize);
5510 Py_XDECREF(errorHandler);
5511 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005512 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005513 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005514 error:
5515 Py_XDECREF(rep);
5516 Py_XDECREF(errorHandler);
5517 Py_XDECREF(exc);
5518 Py_XDECREF(v);
5519 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005520}
5521
Alexander Belopolsky40018472011-02-26 01:02:56 +00005522PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005523PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5524 Py_ssize_t size,
5525 const char *errors,
5526 int byteorder)
5527{
5528 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005529 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005530 if (tmp == NULL)
5531 return NULL;
5532 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5533 Py_DECREF(tmp);
5534 return result;
5535}
5536
5537PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005538PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005539{
Victor Stinnerb960b342011-11-20 19:12:52 +01005540 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005541}
5542
/* --- UTF-16 Codec ------------------------------------------------------- */
5544
Tim Peters772747b2001-08-09 22:21:55 +00005545PyObject *
5546PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 Py_ssize_t size,
5548 const char *errors,
5549 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
Walter Dörwald69652032004-09-07 20:24:22 +00005551 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5552}
5553
5554PyObject *
5555PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 Py_ssize_t size,
5557 const char *errors,
5558 int *byteorder,
5559 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005560{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005562 Py_ssize_t startinpos;
5563 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005564 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005565 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005566 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005567 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005568 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569 PyObject *errorHandler = NULL;
5570 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005571 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572
Tim Peters772747b2001-08-09 22:21:55 +00005573 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575
5576 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005577 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005579 /* Check for BOM marks (U+FEFF) in the input and adjust current
5580 byte order setting accordingly. In native mode, the leading BOM
5581 mark is skipped, in all other modes, it is copied to the output
5582 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005583 if (bo == 0 && size >= 2) {
5584 const Py_UCS4 bom = (q[1] << 8) | q[0];
5585 if (bom == 0xFEFF) {
5586 q += 2;
5587 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005589 else if (bom == 0xFFFE) {
5590 q += 2;
5591 bo = 1;
5592 }
5593 if (byteorder)
5594 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596
Antoine Pitrou63065d72012-05-15 23:48:04 +02005597 if (q == e) {
5598 if (consumed)
5599 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005600 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005601 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005602
Christian Heimes743e0cd2012-10-17 23:52:17 +02005603#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005604 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005605 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005606#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005608 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005609#endif
Tim Peters772747b2001-08-09 22:21:55 +00005610
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 /* Note: size will always be longer than the resulting Unicode
5612 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005613 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005614 writer.min_length = (e - q + 1) / 2;
5615 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005616 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005617
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 while (1) {
5619 Py_UCS4 ch = 0;
5620 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005625 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 native_ordering);
5627 else
5628 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 native_ordering);
5631 } else if (kind == PyUnicode_2BYTE_KIND) {
5632 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 native_ordering);
5635 } else {
5636 assert(kind == PyUnicode_4BYTE_KIND);
5637 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005638 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005639 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005640 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005641 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 switch (ch)
5644 {
5645 case 0:
5646 /* remaining byte at the end? (size should be even) */
5647 if (q == e || consumed)
5648 goto End;
5649 errmsg = "truncated data";
5650 startinpos = ((const char *)q) - starts;
5651 endinpos = ((const char *)e) - starts;
5652 break;
5653 /* The remaining input chars are ignored if the callback
5654 chooses to skip the input */
5655 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005656 q -= 2;
5657 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005658 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005659 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005660 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005661 endinpos = ((const char *)e) - starts;
5662 break;
5663 case 2:
5664 errmsg = "illegal encoding";
5665 startinpos = ((const char *)q) - 2 - starts;
5666 endinpos = startinpos + 2;
5667 break;
5668 case 3:
5669 errmsg = "illegal UTF-16 surrogate";
5670 startinpos = ((const char *)q) - 4 - starts;
5671 endinpos = startinpos + 2;
5672 break;
5673 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005674 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 continue;
5677 }
5678
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005679 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005680 errors,
5681 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005682 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005683 &starts,
5684 (const char **)&e,
5685 &startinpos,
5686 &endinpos,
5687 &exc,
5688 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
5692
Antoine Pitrou63065d72012-05-15 23:48:04 +02005693End:
Walter Dörwald69652032004-09-07 20:24:22 +00005694 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 Py_XDECREF(errorHandler);
5698 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005702 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 Py_XDECREF(errorHandler);
5704 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return NULL;
5706}
5707
Tim Peters772747b2001-08-09 22:21:55 +00005708PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005709_PyUnicode_EncodeUTF16(PyObject *str,
5710 const char *errors,
5711 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005713 enum PyUnicode_Kind kind;
5714 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005715 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005716 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005717 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005718 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005719#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005720 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005721#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005722 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005723#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 const char *encoding;
5725 Py_ssize_t nsize, pos;
5726 PyObject *errorHandler = NULL;
5727 PyObject *exc = NULL;
5728 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005729
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 if (!PyUnicode_Check(str)) {
5731 PyErr_BadArgument();
5732 return NULL;
5733 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005734 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 return NULL;
5736 kind = PyUnicode_KIND(str);
5737 data = PyUnicode_DATA(str);
5738 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005739
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005741 if (kind == PyUnicode_4BYTE_KIND) {
5742 const Py_UCS4 *in = (const Py_UCS4 *)data;
5743 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005744 while (in < end) {
5745 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005746 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005747 }
5748 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005750 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 nsize = len + pairs + (byteorder == 0);
5754 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005759 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005760 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005763 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 }
5765 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005766 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005767 }
Tim Peters772747b2001-08-09 22:21:55 +00005768
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 if (kind == PyUnicode_1BYTE_KIND) {
5770 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5771 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005772 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005773
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005776 }
5777 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005778 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005779 }
5780 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005782 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783
5784 pos = 0;
5785 while (pos < len) {
5786 Py_ssize_t repsize, moreunits;
5787
5788 if (kind == PyUnicode_2BYTE_KIND) {
5789 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5790 &out, native_ordering);
5791 }
5792 else {
5793 assert(kind == PyUnicode_4BYTE_KIND);
5794 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5795 &out, native_ordering);
5796 }
5797 if (pos == len)
5798 break;
5799
5800 rep = unicode_encode_call_errorhandler(
5801 errors, &errorHandler,
5802 encoding, "surrogates not allowed",
5803 str, &exc, pos, pos + 1, &pos);
5804 if (!rep)
5805 goto error;
5806
5807 if (PyBytes_Check(rep)) {
5808 repsize = PyBytes_GET_SIZE(rep);
5809 if (repsize & 1) {
5810 raise_encode_exception(&exc, encoding,
5811 str, pos - 1, pos,
5812 "surrogates not allowed");
5813 goto error;
5814 }
5815 moreunits = repsize / 2;
5816 }
5817 else {
5818 assert(PyUnicode_Check(rep));
5819 if (PyUnicode_READY(rep) < 0)
5820 goto error;
5821 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5822 if (!PyUnicode_IS_ASCII(rep)) {
5823 raise_encode_exception(&exc, encoding,
5824 str, pos - 1, pos,
5825 "surrogates not allowed");
5826 goto error;
5827 }
5828 }
5829
5830 /* two bytes are reserved for each surrogate */
5831 if (moreunits > 1) {
5832 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005833 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005834 /* integer overflow */
5835 PyErr_NoMemory();
5836 goto error;
5837 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005838 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 goto error;
5840 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5841 }
5842
5843 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005844 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005845 out += moreunits;
5846 } else /* rep is unicode */ {
5847 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5848 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5849 &out, native_ordering);
5850 }
5851
5852 Py_CLEAR(rep);
5853 }
5854
5855 /* Cut back to size actually needed. This is necessary for, for example,
5856 encoding of a string containing isolated surrogates and the 'ignore' handler
5857 is used. */
5858 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5859 if (nsize != PyBytes_GET_SIZE(v))
5860 _PyBytes_Resize(&v, nsize);
5861 Py_XDECREF(errorHandler);
5862 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005863 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005864 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005865 error:
5866 Py_XDECREF(rep);
5867 Py_XDECREF(errorHandler);
5868 Py_XDECREF(exc);
5869 Py_XDECREF(v);
5870 return NULL;
5871#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872}
5873
Alexander Belopolsky40018472011-02-26 01:02:56 +00005874PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5876 Py_ssize_t size,
5877 const char *errors,
5878 int byteorder)
5879{
5880 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005881 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 if (tmp == NULL)
5883 return NULL;
5884 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5885 Py_DECREF(tmp);
5886 return result;
5887}
5888
5889PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005890PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893}
5894
/* --- Unicode Escape Codec ----------------------------------------------- */
5896
Fredrik Lundh06d12682001-01-24 07:59:11 +00005897static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005898
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005900_PyUnicode_DecodeUnicodeEscape(const char *s,
5901 Py_ssize_t size,
5902 const char *errors,
5903 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005906 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 PyObject *errorHandler = NULL;
5909 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005910
Eric V. Smith42454af2016-10-31 09:22:08 -04005911 // so we can remember if we've seen an invalid escape char or not
5912 *first_invalid_escape = NULL;
5913
Victor Stinner62ec3312016-09-06 17:04:34 -07005914 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005915 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005916 }
5917 /* Escaped strings will always be longer than the resulting
5918 Unicode string, so we start with size here and then reduce the
5919 length after conversion to the true value.
5920 (but if the error callback returns a long replacement string
5921 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005922 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005923 writer.min_length = size;
5924 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5925 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005926 }
5927
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 end = s + size;
5929 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005930 unsigned char c = (unsigned char) *s++;
5931 Py_UCS4 ch;
5932 int count;
5933 Py_ssize_t startinpos;
5934 Py_ssize_t endinpos;
5935 const char *message;
5936
5937#define WRITE_ASCII_CHAR(ch) \
5938 do { \
5939 assert(ch <= 127); \
5940 assert(writer.pos < writer.size); \
5941 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5942 } while(0)
5943
5944#define WRITE_CHAR(ch) \
5945 do { \
5946 if (ch <= writer.maxchar) { \
5947 assert(writer.pos < writer.size); \
5948 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5949 } \
5950 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5951 goto onError; \
5952 } \
5953 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
5955 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005956 if (c != '\\') {
5957 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 continue;
5959 }
5960
Victor Stinner62ec3312016-09-06 17:04:34 -07005961 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005963 if (s >= end) {
5964 message = "\\ at end of string";
5965 goto error;
5966 }
5967 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005970 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 case '\n': continue;
5974 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5975 case '\'': WRITE_ASCII_CHAR('\''); continue;
5976 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5977 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005978 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005979 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5980 case 't': WRITE_ASCII_CHAR('\t'); continue;
5981 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5982 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005986 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 case '0': case '1': case '2': case '3':
5990 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005992 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 ch = (ch<<3) + *s++ - '0';
5994 if (s < end && '0' <= *s && *s <= '7') {
5995 ch = (ch<<3) + *s++ - '0';
5996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005998 WRITE_CHAR(ch);
5999 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 /* hex escapes */
6002 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006004 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006005 message = "truncated \\xXX escape";
6006 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006010 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006011 message = "truncated \\uXXXX escape";
6012 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006015 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006016 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006017 message = "truncated \\UXXXXXXXX escape";
6018 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006019 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006020 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006021 ch <<= 4;
6022 if (c >= '0' && c <= '9') {
6023 ch += c - '0';
6024 }
6025 else if (c >= 'a' && c <= 'f') {
6026 ch += c - ('a' - 10);
6027 }
6028 else if (c >= 'A' && c <= 'F') {
6029 ch += c - ('A' - 10);
6030 }
6031 else {
6032 break;
6033 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006034 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006035 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006036 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006037 }
6038
6039 /* when we get here, ch is a 32-bit unicode character */
6040 if (ch > MAX_UNICODE) {
6041 message = "illegal Unicode character";
6042 goto error;
6043 }
6044
6045 WRITE_CHAR(ch);
6046 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006047
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006050 if (ucnhash_CAPI == NULL) {
6051 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006052 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6053 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 if (ucnhash_CAPI == NULL) {
6055 PyErr_SetString(
6056 PyExc_UnicodeError,
6057 "\\N escapes not supported (can't load unicodedata module)"
6058 );
6059 goto onError;
6060 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006061 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006062
6063 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006064 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 const char *start = ++s;
6066 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006067 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006068 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006070 namelen = s - start;
6071 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006074 ch = 0xffffffff; /* in case 'getcode' messes up */
6075 if (namelen <= INT_MAX &&
6076 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6077 &ch, 0)) {
6078 assert(ch <= MAX_UNICODE);
6079 WRITE_CHAR(ch);
6080 continue;
6081 }
6082 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006083 }
6084 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006085 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006086
6087 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006088 if (*first_invalid_escape == NULL) {
6089 *first_invalid_escape = s-1; /* Back up one char, since we've
6090 already incremented s. */
6091 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006092 WRITE_ASCII_CHAR('\\');
6093 WRITE_CHAR(c);
6094 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006096
6097 error:
6098 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006099 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006100 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006101 errors, &errorHandler,
6102 "unicodeescape", message,
6103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006104 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006105 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006106 }
6107 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6108 goto onError;
6109 }
6110
6111#undef WRITE_ASCII_CHAR
6112#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006114
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006115 Py_XDECREF(errorHandler);
6116 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006117 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006118
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006120 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 Py_XDECREF(errorHandler);
6122 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 return NULL;
6124}
6125
Eric V. Smith42454af2016-10-31 09:22:08 -04006126PyObject *
6127PyUnicode_DecodeUnicodeEscape(const char *s,
6128 Py_ssize_t size,
6129 const char *errors)
6130{
6131 const char *first_invalid_escape;
6132 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6133 &first_invalid_escape);
6134 if (result == NULL)
6135 return NULL;
6136 if (first_invalid_escape != NULL) {
6137 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6138 "invalid escape sequence '\\%c'",
6139 *first_invalid_escape) < 0) {
6140 Py_DECREF(result);
6141 return NULL;
6142 }
6143 }
6144 return result;
6145}
6146
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006147/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Alexander Belopolsky40018472011-02-26 01:02:56 +00006149PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006155 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
Ezio Melottie7f90372012-10-05 03:33:31 +03006159 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006160 escape.
6161
Ezio Melottie7f90372012-10-05 03:33:31 +03006162 For UCS1 strings it's '\xxx', 4 bytes per source character.
6163 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6164 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006165 */
6166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 if (!PyUnicode_Check(unicode)) {
6168 PyErr_BadArgument();
6169 return NULL;
6170 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006171 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 }
Victor Stinner358af132015-10-12 22:36:57 +02006174
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 if (len == 0) {
6177 return PyBytes_FromStringAndSize(NULL, 0);
6178 }
6179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 kind = PyUnicode_KIND(unicode);
6181 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6183 bytes, and 1 byte characters 4. */
6184 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006185 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006186 return PyErr_NoMemory();
6187 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006188 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 if (repr == NULL) {
6190 return NULL;
6191 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006195 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006196
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 /* U+0000-U+00ff range */
6198 if (ch < 0x100) {
6199 if (ch >= ' ' && ch < 127) {
6200 if (ch != '\\') {
6201 /* Copy printable US ASCII as-is */
6202 *p++ = (char) ch;
6203 }
6204 /* Escape backslashes */
6205 else {
6206 *p++ = '\\';
6207 *p++ = '\\';
6208 }
6209 }
Victor Stinner358af132015-10-12 22:36:57 +02006210
Victor Stinner62ec3312016-09-06 17:04:34 -07006211 /* Map special whitespace to '\t', \n', '\r' */
6212 else if (ch == '\t') {
6213 *p++ = '\\';
6214 *p++ = 't';
6215 }
6216 else if (ch == '\n') {
6217 *p++ = '\\';
6218 *p++ = 'n';
6219 }
6220 else if (ch == '\r') {
6221 *p++ = '\\';
6222 *p++ = 'r';
6223 }
6224
6225 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6226 else {
6227 *p++ = '\\';
6228 *p++ = 'x';
6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230 *p++ = Py_hexdigits[ch & 0x000F];
6231 }
Tim Petersced69f82003-09-16 20:30:58 +00006232 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006233 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 *p++ = '\\';
6236 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006237 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6238 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6239 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6240 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6243 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006244
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 /* Make sure that the first two digits are zero */
6246 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006247 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 *p++ = 'U';
6249 *p++ = '0';
6250 *p++ = '0';
6251 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6256 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 assert(p - PyBytes_AS_STRING(repr) > 0);
6261 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6262 return NULL;
6263 }
6264 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265}
6266
Alexander Belopolsky40018472011-02-26 01:02:56 +00006267PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006271 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006272 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006273 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006275 }
6276
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 result = PyUnicode_AsUnicodeEscapeString(tmp);
6278 Py_DECREF(tmp);
6279 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280}
6281
6282/* --- Raw Unicode Escape Codec ------------------------------------------- */
6283
Alexander Belopolsky40018472011-02-26 01:02:56 +00006284PyObject *
6285PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006286 Py_ssize_t size,
6287 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006290 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 PyObject *errorHandler = NULL;
6293 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006294
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006296 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006298
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 /* Escaped strings will always be longer than the resulting
6300 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 length after conversion to the true value. (But decoding error
6302 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006303 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 writer.min_length = size;
6305 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6306 goto onError;
6307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006308
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 end = s + size;
6310 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 unsigned char c = (unsigned char) *s++;
6312 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006313 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 Py_ssize_t startinpos;
6315 Py_ssize_t endinpos;
6316 const char *message;
6317
6318#define WRITE_CHAR(ch) \
6319 do { \
6320 if (ch <= writer.maxchar) { \
6321 assert(writer.pos < writer.size); \
6322 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6323 } \
6324 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6325 goto onError; \
6326 } \
6327 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 if (c != '\\' || s >= end) {
6331 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006334
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 c = (unsigned char) *s++;
6336 if (c == 'u') {
6337 count = 4;
6338 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 else if (c == 'U') {
6341 count = 8;
6342 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 }
6344 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 assert(writer.pos < writer.size);
6346 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6347 WRITE_CHAR(c);
6348 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006349 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 startinpos = s - starts - 2;
6351
6352 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6353 for (ch = 0; count && s < end; ++s, --count) {
6354 c = (unsigned char)*s;
6355 ch <<= 4;
6356 if (c >= '0' && c <= '9') {
6357 ch += c - '0';
6358 }
6359 else if (c >= 'a' && c <= 'f') {
6360 ch += c - ('a' - 10);
6361 }
6362 else if (c >= 'A' && c <= 'F') {
6363 ch += c - ('A' - 10);
6364 }
6365 else {
6366 break;
6367 }
6368 }
6369 if (!count) {
6370 if (ch <= MAX_UNICODE) {
6371 WRITE_CHAR(ch);
6372 continue;
6373 }
6374 message = "\\Uxxxxxxxx out of range";
6375 }
6376
6377 endinpos = s-starts;
6378 writer.min_length = end - s + writer.pos;
6379 if (unicode_decode_call_errorhandler_writer(
6380 errors, &errorHandler,
6381 "rawunicodeescape", message,
6382 &starts, &end, &startinpos, &endinpos, &exc, &s,
6383 &writer)) {
6384 goto onError;
6385 }
6386 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6387 goto onError;
6388 }
6389
6390#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 Py_XDECREF(errorHandler);
6393 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006394 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006395
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006397 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 Py_XDECREF(errorHandler);
6399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402}
6403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006404
Alexander Belopolsky40018472011-02-26 01:02:56 +00006405PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 int kind;
6412 void *data;
6413 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 if (!PyUnicode_Check(unicode)) {
6416 PyErr_BadArgument();
6417 return NULL;
6418 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006420 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006422 kind = PyUnicode_KIND(unicode);
6423 data = PyUnicode_DATA(unicode);
6424 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 if (kind == PyUnicode_1BYTE_KIND) {
6426 return PyBytes_FromStringAndSize(data, len);
6427 }
Victor Stinner0e368262011-11-10 20:12:49 +01006428
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6430 bytes, and 1 byte characters 4. */
6431 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006432
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 if (len > PY_SSIZE_T_MAX / expandsize) {
6434 return PyErr_NoMemory();
6435 }
6436 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6437 if (repr == NULL) {
6438 return NULL;
6439 }
6440 if (len == 0) {
6441 return repr;
6442 }
6443
6444 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006445 for (pos = 0; pos < len; pos++) {
6446 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006447
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6449 if (ch < 0x100) {
6450 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006451 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6453 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 *p++ = '\\';
6455 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006456 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6457 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6458 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6459 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6462 else {
6463 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6464 *p++ = '\\';
6465 *p++ = 'U';
6466 *p++ = '0';
6467 *p++ = '0';
6468 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6469 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6473 *p++ = Py_hexdigits[ch & 15];
6474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006476
Victor Stinner62ec3312016-09-06 17:04:34 -07006477 assert(p > PyBytes_AS_STRING(repr));
6478 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6479 return NULL;
6480 }
6481 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482}
6483
Alexander Belopolsky40018472011-02-26 01:02:56 +00006484PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006485PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6486 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006488 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006489 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006490 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006491 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6493 Py_DECREF(tmp);
6494 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495}
6496
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006497/* --- Unicode Internal Codec ------------------------------------------- */
6498
Alexander Belopolsky40018472011-02-26 01:02:56 +00006499PyObject *
6500_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006501 Py_ssize_t size,
6502 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006503{
6504 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006505 Py_ssize_t startinpos;
6506 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006507 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006508 const char *end;
6509 const char *reason;
6510 PyObject *errorHandler = NULL;
6511 PyObject *exc = NULL;
6512
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006513 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006514 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006515 1))
6516 return NULL;
6517
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006518 if (size < 0) {
6519 PyErr_BadInternalCall();
6520 return NULL;
6521 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006522 if (size == 0)
6523 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006524
Victor Stinner8f674cc2013-04-17 23:02:17 +02006525 _PyUnicodeWriter_Init(&writer);
6526 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6527 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 }
6530 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006531
Victor Stinner8f674cc2013-04-17 23:02:17 +02006532 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006533 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006534 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006535 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006536 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006537 endinpos = end-starts;
6538 reason = "truncated input";
6539 goto error;
6540 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006541 /* We copy the raw representation one byte at a time because the
6542 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006543 ((char *) &uch)[0] = s[0];
6544 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006545#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006546 ((char *) &uch)[2] = s[2];
6547 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006548#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006549 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006550#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006551 /* We have to sanity check the raw data, otherwise doom looms for
6552 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006553 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006554 endinpos = s - starts + Py_UNICODE_SIZE;
6555 reason = "illegal code point (> 0x10FFFF)";
6556 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006557 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006558#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006559 s += Py_UNICODE_SIZE;
6560#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006561 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006562 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006563 Py_UNICODE uch2;
6564 ((char *) &uch2)[0] = s[0];
6565 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006566 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006567 {
Victor Stinner551ac952011-11-29 22:58:13 +01006568 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006569 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006570 }
6571 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006572#endif
6573
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006574 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006575 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006576 continue;
6577
6578 error:
6579 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006580 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006581 errors, &errorHandler,
6582 "unicode_internal", reason,
6583 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006584 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006585 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006586 }
6587
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006588 Py_XDECREF(errorHandler);
6589 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006590 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006591
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006593 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006594 Py_XDECREF(errorHandler);
6595 Py_XDECREF(exc);
6596 return NULL;
6597}
6598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599/* --- Latin-1 Codec ------------------------------------------------------ */
6600
Alexander Belopolsky40018472011-02-26 01:02:56 +00006601PyObject *
6602PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006603 Py_ssize_t size,
6604 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006607 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608}
6609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611static void
6612make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006613 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006614 PyObject *unicode,
6615 Py_ssize_t startpos, Py_ssize_t endpos,
6616 const char *reason)
6617{
6618 if (*exceptionObject == NULL) {
6619 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006621 encoding, unicode, startpos, endpos, reason);
6622 }
6623 else {
6624 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6625 goto onError;
6626 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6627 goto onError;
6628 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6629 goto onError;
6630 return;
6631 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006632 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006633 }
6634}
6635
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006637static void
6638raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006639 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006640 PyObject *unicode,
6641 Py_ssize_t startpos, Py_ssize_t endpos,
6642 const char *reason)
6643{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006644 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006645 encoding, unicode, startpos, endpos, reason);
6646 if (*exceptionObject != NULL)
6647 PyCodec_StrictErrors(*exceptionObject);
6648}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649
6650/* error handling callback helper:
6651 build arguments, call the callback and check the arguments,
6652 put the result into newpos and return the replacement string, which
6653 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654static PyObject *
6655unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006656 PyObject **errorHandler,
6657 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006659 Py_ssize_t startpos, Py_ssize_t endpos,
6660 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006662 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664 PyObject *restuple;
6665 PyObject *resunicode;
6666
6667 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 }
6672
Benjamin Petersonbac79492012-01-14 13:34:47 -05006673 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return NULL;
6675 len = PyUnicode_GET_LENGTH(unicode);
6676
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006677 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006682 restuple = PyObject_CallFunctionObjArgs(
6683 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006687 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 Py_DECREF(restuple);
6689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006691 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 &resunicode, newpos)) {
6693 Py_DECREF(restuple);
6694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006696 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6697 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6698 Py_DECREF(restuple);
6699 return NULL;
6700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 *newpos = len + *newpos;
6703 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006704 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 Py_DECREF(restuple);
6706 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 Py_INCREF(resunicode);
6709 Py_DECREF(restuple);
6710 return resunicode;
6711}
6712
Alexander Belopolsky40018472011-02-26 01:02:56 +00006713static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006714unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006715 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006716 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 /* input state */
6719 Py_ssize_t pos=0, size;
6720 int kind;
6721 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 /* pointer into the output */
6723 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006724 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6725 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006726 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006728 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006729 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006730 /* output object */
6731 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732
Benjamin Petersonbac79492012-01-14 13:34:47 -05006733 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 return NULL;
6735 size = PyUnicode_GET_LENGTH(unicode);
6736 kind = PyUnicode_KIND(unicode);
6737 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738 /* allocate enough for a simple encoding without
6739 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006740 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006741 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006742
6743 _PyBytesWriter_Init(&writer);
6744 str = _PyBytesWriter_Alloc(&writer, size);
6745 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006746 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006748 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006749 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006752 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006754 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006755 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006756 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006758 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006760 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006761 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006763
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006764 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006766
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006767 /* Only overallocate the buffer if it's not the last write */
6768 writer.overallocate = (collend < size);
6769
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006771 if (error_handler == _Py_ERROR_UNKNOWN)
6772 error_handler = get_error_handler(errors);
6773
6774 switch (error_handler) {
6775 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006776 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006778
6779 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006780 memset(str, '?', collend - collstart);
6781 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006782 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006783 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 break;
Victor Stinner50149202015-09-22 00:26:54 +02006786
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006787 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006788 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006789 writer.min_size -= (collend - collstart);
6790 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006791 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006792 if (str == NULL)
6793 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006794 pos = collend;
6795 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006796
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006797 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006798 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006799 writer.min_size -= (collend - collstart);
6800 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006801 unicode, collstart, collend);
6802 if (str == NULL)
6803 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006804 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 break;
Victor Stinner50149202015-09-22 00:26:54 +02006806
Victor Stinnerc3713e92015-09-29 12:32:13 +02006807 case _Py_ERROR_SURROGATEESCAPE:
6808 for (i = collstart; i < collend; ++i) {
6809 ch = PyUnicode_READ(kind, data, i);
6810 if (ch < 0xdc80 || 0xdcff < ch) {
6811 /* Not a UTF-8b surrogate */
6812 break;
6813 }
6814 *str++ = (char)(ch - 0xdc00);
6815 ++pos;
6816 }
6817 if (i >= collend)
6818 break;
6819 collstart = pos;
6820 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006821 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006822
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006824 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6825 encoding, reason, unicode, &exc,
6826 collstart, collend, &newpos);
6827 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006829
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006830 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006831 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006832
Victor Stinner6bd525b2015-10-09 13:10:05 +02006833 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006834 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006835 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006836 PyBytes_AS_STRING(rep),
6837 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006838 if (str == NULL)
6839 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006840 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006841 else {
6842 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006843
Victor Stinner6bd525b2015-10-09 13:10:05 +02006844 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006846
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006847 if (limit == 256 ?
6848 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6849 !PyUnicode_IS_ASCII(rep))
6850 {
6851 /* Not all characters are smaller than limit */
6852 raise_encode_exception(&exc, encoding, unicode,
6853 collstart, collend, reason);
6854 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006856 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6857 str = _PyBytesWriter_WriteBytes(&writer, str,
6858 PyUnicode_DATA(rep),
6859 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006861 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006862 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006863 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006864
6865 /* If overallocation was disabled, ensure that it was the last
6866 write. Otherwise, we missed an optimization */
6867 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006868 }
6869 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006870
Victor Stinner50149202015-09-22 00:26:54 +02006871 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006873 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006874
6875 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006876 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006877 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006878 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006879 Py_XDECREF(exc);
6880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881}
6882
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006883/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884PyObject *
6885PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006886 Py_ssize_t size,
6887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006889 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006890 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006891 if (unicode == NULL)
6892 return NULL;
6893 result = unicode_encode_ucs1(unicode, errors, 256);
6894 Py_DECREF(unicode);
6895 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Alexander Belopolsky40018472011-02-26 01:02:56 +00006898PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006899_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
6901 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 PyErr_BadArgument();
6903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905 if (PyUnicode_READY(unicode) == -1)
6906 return NULL;
6907 /* Fast path: if it is a one-byte string, construct
6908 bytes object directly. */
6909 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6910 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6911 PyUnicode_GET_LENGTH(unicode));
6912 /* Non-Latin-1 characters present. Defer to above function to
6913 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006915}
6916
6917PyObject*
6918PyUnicode_AsLatin1String(PyObject *unicode)
6919{
6920 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
6923/* --- 7-bit ASCII Codec -------------------------------------------------- */
6924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925PyObject *
6926PyUnicode_DecodeASCII(const char *s,
6927 Py_ssize_t size,
6928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006931 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006932 int kind;
6933 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006934 Py_ssize_t startinpos;
6935 Py_ssize_t endinpos;
6936 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006938 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006940 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006941
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006943 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006946 if (size == 1 && (unsigned char)s[0] < 128)
6947 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006948
Victor Stinner8f674cc2013-04-17 23:02:17 +02006949 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006950 writer.min_length = size;
6951 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006952 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006953
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006954 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006955 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006956 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006957 writer.pos = outpos;
6958 if (writer.pos == size)
6959 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006960
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006961 s += writer.pos;
6962 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006964 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 PyUnicode_WRITE(kind, data, writer.pos, c);
6967 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006969 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006971
6972 /* byte outsize range 0x00..0x7f: call the error handler */
6973
6974 if (error_handler == _Py_ERROR_UNKNOWN)
6975 error_handler = get_error_handler(errors);
6976
6977 switch (error_handler)
6978 {
6979 case _Py_ERROR_REPLACE:
6980 case _Py_ERROR_SURROGATEESCAPE:
6981 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006982 but we may switch to UCS2 at the first write */
6983 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6984 goto onError;
6985 kind = writer.kind;
6986 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006987
6988 if (error_handler == _Py_ERROR_REPLACE)
6989 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6990 else
6991 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6992 writer.pos++;
6993 ++s;
6994 break;
6995
6996 case _Py_ERROR_IGNORE:
6997 ++s;
6998 break;
6999
7000 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007001 startinpos = s-starts;
7002 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007003 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007004 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 "ascii", "ordinal not in range(128)",
7006 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007009 kind = writer.kind;
7010 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007013 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007015 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007016
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007018 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007019 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007020 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 return NULL;
7022}
7023
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007024/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025PyObject *
7026PyUnicode_EncodeASCII(const Py_UNICODE *p,
7027 Py_ssize_t size,
7028 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007030 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007031 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007032 if (unicode == NULL)
7033 return NULL;
7034 result = unicode_encode_ucs1(unicode, errors, 128);
7035 Py_DECREF(unicode);
7036 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037}
7038
Alexander Belopolsky40018472011-02-26 01:02:56 +00007039PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041{
7042 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 PyErr_BadArgument();
7044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007046 if (PyUnicode_READY(unicode) == -1)
7047 return NULL;
7048 /* Fast path: if it is an ASCII-only string, construct bytes object
7049 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007050 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007051 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7052 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007053 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007054}
7055
7056PyObject *
7057PyUnicode_AsASCIIString(PyObject *unicode)
7058{
7059 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060}
7061
Steve Dowercc16be82016-09-08 10:35:16 -07007062#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007063
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007064/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007065
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007066#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007067#define NEED_RETRY
7068#endif
7069
Victor Stinner3a50e702011-10-18 21:21:00 +02007070#ifndef WC_ERR_INVALID_CHARS
7071# define WC_ERR_INVALID_CHARS 0x0080
7072#endif
7073
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007074static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007075code_page_name(UINT code_page, PyObject **obj)
7076{
7077 *obj = NULL;
7078 if (code_page == CP_ACP)
7079 return "mbcs";
7080 if (code_page == CP_UTF7)
7081 return "CP_UTF7";
7082 if (code_page == CP_UTF8)
7083 return "CP_UTF8";
7084
7085 *obj = PyBytes_FromFormat("cp%u", code_page);
7086 if (*obj == NULL)
7087 return NULL;
7088 return PyBytes_AS_STRING(*obj);
7089}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090
Victor Stinner3a50e702011-10-18 21:21:00 +02007091static DWORD
7092decode_code_page_flags(UINT code_page)
7093{
7094 if (code_page == CP_UTF7) {
7095 /* The CP_UTF7 decoder only supports flags=0 */
7096 return 0;
7097 }
7098 else
7099 return MB_ERR_INVALID_CHARS;
7100}
7101
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 * Decode a byte string from a Windows code page into unicode object in strict
7104 * mode.
7105 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007106 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7107 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007109static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007110decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007111 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 const char *in,
7113 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114{
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007116 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118
7119 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 assert(insize > 0);
7121 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7122 if (outsize <= 0)
7123 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124
7125 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007127 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007128 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 if (*v == NULL)
7130 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007132 }
7133 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007136 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139 }
7140
7141 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7143 if (outsize <= 0)
7144 goto error;
7145 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007146
Victor Stinner3a50e702011-10-18 21:21:00 +02007147error:
7148 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7149 return -2;
7150 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007151 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007152}
7153
Victor Stinner3a50e702011-10-18 21:21:00 +02007154/*
7155 * Decode a byte string from a code page into unicode object with an error
7156 * handler.
7157 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007158 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 * UnicodeDecodeError exception and returns -1 on error.
7160 */
7161static int
7162decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007163 PyObject **v,
7164 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007165 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007166{
7167 const char *startin = in;
7168 const char *endin = in + size;
7169 const DWORD flags = decode_code_page_flags(code_page);
7170 /* Ideally, we should get reason from FormatMessage. This is the Windows
7171 2000 English version of the message. */
7172 const char *reason = "No mapping for the Unicode character exists "
7173 "in the target code page.";
7174 /* each step cannot decode more than 1 character, but a character can be
7175 represented as a surrogate pair */
7176 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007177 int insize;
7178 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 PyObject *errorHandler = NULL;
7180 PyObject *exc = NULL;
7181 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007182 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 DWORD err;
7184 int ret = -1;
7185
7186 assert(size > 0);
7187
7188 encoding = code_page_name(code_page, &encoding_obj);
7189 if (encoding == NULL)
7190 return -1;
7191
Victor Stinner7d00cc12014-03-17 23:08:06 +01007192 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7194 UnicodeDecodeError. */
7195 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7196 if (exc != NULL) {
7197 PyCodec_StrictErrors(exc);
7198 Py_CLEAR(exc);
7199 }
7200 goto error;
7201 }
7202
7203 if (*v == NULL) {
7204 /* Create unicode object */
7205 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7206 PyErr_NoMemory();
7207 goto error;
7208 }
Victor Stinnerab595942011-12-17 04:59:06 +01007209 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007210 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 if (*v == NULL)
7212 goto error;
7213 startout = PyUnicode_AS_UNICODE(*v);
7214 }
7215 else {
7216 /* Extend unicode object */
7217 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7218 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7219 PyErr_NoMemory();
7220 goto error;
7221 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007222 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 goto error;
7224 startout = PyUnicode_AS_UNICODE(*v) + n;
7225 }
7226
7227 /* Decode the byte string character per character */
7228 out = startout;
7229 while (in < endin)
7230 {
7231 /* Decode a character */
7232 insize = 1;
7233 do
7234 {
7235 outsize = MultiByteToWideChar(code_page, flags,
7236 in, insize,
7237 buffer, Py_ARRAY_LENGTH(buffer));
7238 if (outsize > 0)
7239 break;
7240 err = GetLastError();
7241 if (err != ERROR_NO_UNICODE_TRANSLATION
7242 && err != ERROR_INSUFFICIENT_BUFFER)
7243 {
7244 PyErr_SetFromWindowsErr(0);
7245 goto error;
7246 }
7247 insize++;
7248 }
7249 /* 4=maximum length of a UTF-8 sequence */
7250 while (insize <= 4 && (in + insize) <= endin);
7251
7252 if (outsize <= 0) {
7253 Py_ssize_t startinpos, endinpos, outpos;
7254
Victor Stinner7d00cc12014-03-17 23:08:06 +01007255 /* last character in partial decode? */
7256 if (in + insize >= endin && !final)
7257 break;
7258
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 startinpos = in - startin;
7260 endinpos = startinpos + 1;
7261 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007262 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 errors, &errorHandler,
7264 encoding, reason,
7265 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007266 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 {
7268 goto error;
7269 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007270 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 }
7272 else {
7273 in += insize;
7274 memcpy(out, buffer, outsize * sizeof(wchar_t));
7275 out += outsize;
7276 }
7277 }
7278
7279 /* write a NUL character at the end */
7280 *out = 0;
7281
7282 /* Extend unicode object */
7283 outsize = out - startout;
7284 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007285 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007287 /* (in - startin) <= size and size is an int */
7288 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007289
7290error:
7291 Py_XDECREF(encoding_obj);
7292 Py_XDECREF(errorHandler);
7293 Py_XDECREF(exc);
7294 return ret;
7295}
7296
Victor Stinner3a50e702011-10-18 21:21:00 +02007297static PyObject *
7298decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 const char *s, Py_ssize_t size,
7300 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301{
Victor Stinner76a31a62011-11-04 00:05:13 +01007302 PyObject *v = NULL;
7303 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007304
Victor Stinner3a50e702011-10-18 21:21:00 +02007305 if (code_page < 0) {
7306 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7307 return NULL;
7308 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007309 if (size < 0) {
7310 PyErr_BadInternalCall();
7311 return NULL;
7312 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007313
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 do
7318 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007320 if (size > INT_MAX) {
7321 chunk_size = INT_MAX;
7322 final = 0;
7323 done = 0;
7324 }
7325 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007327 {
7328 chunk_size = (int)size;
7329 final = (consumed == NULL);
7330 done = 1;
7331 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 if (chunk_size == 0 && done) {
7334 if (v != NULL)
7335 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007336 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007337 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 converted = decode_code_page_strict(code_page, &v,
7340 s, chunk_size);
7341 if (converted == -2)
7342 converted = decode_code_page_errors(code_page, &v,
7343 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007344 errors, final);
7345 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007346
7347 if (converted < 0) {
7348 Py_XDECREF(v);
7349 return NULL;
7350 }
7351
7352 if (consumed)
7353 *consumed += converted;
7354
7355 s += converted;
7356 size -= converted;
7357 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007358
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007359 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360}
7361
Alexander Belopolsky40018472011-02-26 01:02:56 +00007362PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007363PyUnicode_DecodeCodePageStateful(int code_page,
7364 const char *s,
7365 Py_ssize_t size,
7366 const char *errors,
7367 Py_ssize_t *consumed)
7368{
7369 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7370}
7371
7372PyObject *
7373PyUnicode_DecodeMBCSStateful(const char *s,
7374 Py_ssize_t size,
7375 const char *errors,
7376 Py_ssize_t *consumed)
7377{
7378 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7379}
7380
7381PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007382PyUnicode_DecodeMBCS(const char *s,
7383 Py_ssize_t size,
7384 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007385{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007386 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7387}
7388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389static DWORD
7390encode_code_page_flags(UINT code_page, const char *errors)
7391{
7392 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007393 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 }
7395 else if (code_page == CP_UTF7) {
7396 /* CP_UTF7 only supports flags=0 */
7397 return 0;
7398 }
7399 else {
7400 if (errors != NULL && strcmp(errors, "replace") == 0)
7401 return 0;
7402 else
7403 return WC_NO_BEST_FIT_CHARS;
7404 }
7405}
7406
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007407/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 * Encode a Unicode string to a Windows code page into a byte string in strict
7409 * mode.
7410 *
7411 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007412 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007413 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007414static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007415encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007416 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418{
Victor Stinner554f3f02010-06-16 23:33:54 +00007419 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 BOOL *pusedDefaultChar = &usedDefaultChar;
7421 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007422 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007423 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 const DWORD flags = encode_code_page_flags(code_page, NULL);
7425 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007426 /* Create a substring so that we can get the UTF-16 representation
7427 of just the slice under consideration. */
7428 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007429
Martin v. Löwis3d325192011-11-04 18:23:06 +01007430 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007431
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007433 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007435 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007436
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 substring = PyUnicode_Substring(unicode, offset, offset+len);
7438 if (substring == NULL)
7439 return -1;
7440 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7441 if (p == NULL) {
7442 Py_DECREF(substring);
7443 return -1;
7444 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007445 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007446
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007447 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007449 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 NULL, 0,
7451 NULL, pusedDefaultChar);
7452 if (outsize <= 0)
7453 goto error;
7454 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007455 if (pusedDefaultChar && *pusedDefaultChar) {
7456 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007458 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 if (*outbytes == NULL) {
7464 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468 }
7469 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 const Py_ssize_t n = PyBytes_Size(*outbytes);
7472 if (outsize > PY_SSIZE_T_MAX - n) {
7473 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7478 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007480 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482 }
7483
7484 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007486 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 out, outsize,
7488 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007489 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 if (outsize <= 0)
7491 goto error;
7492 if (pusedDefaultChar && *pusedDefaultChar)
7493 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007494 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007497 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7499 return -2;
7500 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007501 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007502}
7503
Victor Stinner3a50e702011-10-18 21:21:00 +02007504/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007505 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 * error handler.
7507 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007508 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 * -1 on other error.
7510 */
7511static int
7512encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007513 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007514 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007515{
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007517 Py_ssize_t pos = unicode_offset;
7518 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007519 /* Ideally, we should get reason from FormatMessage. This is the Windows
7520 2000 English version of the message. */
7521 const char *reason = "invalid character";
7522 /* 4=maximum length of a UTF-8 sequence */
7523 char buffer[4];
7524 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7525 Py_ssize_t outsize;
7526 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 PyObject *errorHandler = NULL;
7528 PyObject *exc = NULL;
7529 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007530 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 PyObject *rep;
7533 int ret = -1;
7534
7535 assert(insize > 0);
7536
7537 encoding = code_page_name(code_page, &encoding_obj);
7538 if (encoding == NULL)
7539 return -1;
7540
7541 if (errors == NULL || strcmp(errors, "strict") == 0) {
7542 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7543 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007544 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 if (exc != NULL) {
7546 PyCodec_StrictErrors(exc);
7547 Py_DECREF(exc);
7548 }
7549 Py_XDECREF(encoding_obj);
7550 return -1;
7551 }
7552
7553 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7554 pusedDefaultChar = &usedDefaultChar;
7555 else
7556 pusedDefaultChar = NULL;
7557
7558 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7559 PyErr_NoMemory();
7560 goto error;
7561 }
7562 outsize = insize * Py_ARRAY_LENGTH(buffer);
7563
7564 if (*outbytes == NULL) {
7565 /* Create string object */
7566 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7567 if (*outbytes == NULL)
7568 goto error;
7569 out = PyBytes_AS_STRING(*outbytes);
7570 }
7571 else {
7572 /* Extend string object */
7573 Py_ssize_t n = PyBytes_Size(*outbytes);
7574 if (n > PY_SSIZE_T_MAX - outsize) {
7575 PyErr_NoMemory();
7576 goto error;
7577 }
7578 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7579 goto error;
7580 out = PyBytes_AS_STRING(*outbytes) + n;
7581 }
7582
7583 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007584 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007586 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7587 wchar_t chars[2];
7588 int charsize;
7589 if (ch < 0x10000) {
7590 chars[0] = (wchar_t)ch;
7591 charsize = 1;
7592 }
7593 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007594 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7595 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007596 charsize = 2;
7597 }
7598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007600 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 buffer, Py_ARRAY_LENGTH(buffer),
7602 NULL, pusedDefaultChar);
7603 if (outsize > 0) {
7604 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7605 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 memcpy(out, buffer, outsize);
7608 out += outsize;
7609 continue;
7610 }
7611 }
7612 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7613 PyErr_SetFromWindowsErr(0);
7614 goto error;
7615 }
7616
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 rep = unicode_encode_call_errorhandler(
7618 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007619 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 if (rep == NULL)
7622 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007623 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007624
7625 if (PyBytes_Check(rep)) {
7626 outsize = PyBytes_GET_SIZE(rep);
7627 if (outsize != 1) {
7628 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7629 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7630 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7631 Py_DECREF(rep);
7632 goto error;
7633 }
7634 out = PyBytes_AS_STRING(*outbytes) + offset;
7635 }
7636 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7637 out += outsize;
7638 }
7639 else {
7640 Py_ssize_t i;
7641 enum PyUnicode_Kind kind;
7642 void *data;
7643
Benjamin Petersonbac79492012-01-14 13:34:47 -05007644 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 Py_DECREF(rep);
7646 goto error;
7647 }
7648
7649 outsize = PyUnicode_GET_LENGTH(rep);
7650 if (outsize != 1) {
7651 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7652 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7653 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7654 Py_DECREF(rep);
7655 goto error;
7656 }
7657 out = PyBytes_AS_STRING(*outbytes) + offset;
7658 }
7659 kind = PyUnicode_KIND(rep);
7660 data = PyUnicode_DATA(rep);
7661 for (i=0; i < outsize; i++) {
7662 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7663 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007664 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 encoding, unicode,
7666 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 "unable to encode error handler result to ASCII");
7668 Py_DECREF(rep);
7669 goto error;
7670 }
7671 *out = (unsigned char)ch;
7672 out++;
7673 }
7674 }
7675 Py_DECREF(rep);
7676 }
7677 /* write a NUL byte */
7678 *out = 0;
7679 outsize = out - PyBytes_AS_STRING(*outbytes);
7680 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7681 if (_PyBytes_Resize(outbytes, outsize) < 0)
7682 goto error;
7683 ret = 0;
7684
7685error:
7686 Py_XDECREF(encoding_obj);
7687 Py_XDECREF(errorHandler);
7688 Py_XDECREF(exc);
7689 return ret;
7690}
7691
Victor Stinner3a50e702011-10-18 21:21:00 +02007692static PyObject *
7693encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007694 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007695 const char *errors)
7696{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007697 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007699 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007701
Victor Stinner29dacf22015-01-26 16:41:32 +01007702 if (!PyUnicode_Check(unicode)) {
7703 PyErr_BadArgument();
7704 return NULL;
7705 }
7706
Benjamin Petersonbac79492012-01-14 13:34:47 -05007707 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007708 return NULL;
7709 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007710
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 if (code_page < 0) {
7712 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7713 return NULL;
7714 }
7715
Martin v. Löwis3d325192011-11-04 18:23:06 +01007716 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007717 return PyBytes_FromStringAndSize(NULL, 0);
7718
Victor Stinner7581cef2011-11-03 22:32:33 +01007719 offset = 0;
7720 do
7721 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007722#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007723 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007724 chunks. */
7725 if (len > INT_MAX/2) {
7726 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007727 done = 0;
7728 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007729 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007730#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007733 done = 1;
7734 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007735
Victor Stinner76a31a62011-11-04 00:05:13 +01007736 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 errors);
7739 if (ret == -2)
7740 ret = encode_code_page_errors(code_page, &outbytes,
7741 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 if (ret < 0) {
7744 Py_XDECREF(outbytes);
7745 return NULL;
7746 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007747
Victor Stinner7581cef2011-11-03 22:32:33 +01007748 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007749 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007750 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751
Victor Stinner3a50e702011-10-18 21:21:00 +02007752 return outbytes;
7753}
7754
7755PyObject *
7756PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7757 Py_ssize_t size,
7758 const char *errors)
7759{
Victor Stinner7581cef2011-11-03 22:32:33 +01007760 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007761 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 if (unicode == NULL)
7763 return NULL;
7764 res = encode_code_page(CP_ACP, unicode, errors);
7765 Py_DECREF(unicode);
7766 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007767}
7768
7769PyObject *
7770PyUnicode_EncodeCodePage(int code_page,
7771 PyObject *unicode,
7772 const char *errors)
7773{
Victor Stinner7581cef2011-11-03 22:32:33 +01007774 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007775}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007776
Alexander Belopolsky40018472011-02-26 01:02:56 +00007777PyObject *
7778PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007779{
Victor Stinner7581cef2011-11-03 22:32:33 +01007780 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007781}
7782
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007783#undef NEED_RETRY
7784
Steve Dowercc16be82016-09-08 10:35:16 -07007785#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787/* --- Character Mapping Codec -------------------------------------------- */
7788
Victor Stinnerfb161b12013-04-18 01:44:27 +02007789static int
7790charmap_decode_string(const char *s,
7791 Py_ssize_t size,
7792 PyObject *mapping,
7793 const char *errors,
7794 _PyUnicodeWriter *writer)
7795{
7796 const char *starts = s;
7797 const char *e;
7798 Py_ssize_t startinpos, endinpos;
7799 PyObject *errorHandler = NULL, *exc = NULL;
7800 Py_ssize_t maplen;
7801 enum PyUnicode_Kind mapkind;
7802 void *mapdata;
7803 Py_UCS4 x;
7804 unsigned char ch;
7805
7806 if (PyUnicode_READY(mapping) == -1)
7807 return -1;
7808
7809 maplen = PyUnicode_GET_LENGTH(mapping);
7810 mapdata = PyUnicode_DATA(mapping);
7811 mapkind = PyUnicode_KIND(mapping);
7812
7813 e = s + size;
7814
7815 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7816 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7817 * is disabled in encoding aliases, latin1 is preferred because
7818 * its implementation is faster. */
7819 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7820 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7821 Py_UCS4 maxchar = writer->maxchar;
7822
7823 assert (writer->kind == PyUnicode_1BYTE_KIND);
7824 while (s < e) {
7825 ch = *s;
7826 x = mapdata_ucs1[ch];
7827 if (x > maxchar) {
7828 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7829 goto onError;
7830 maxchar = writer->maxchar;
7831 outdata = (Py_UCS1 *)writer->data;
7832 }
7833 outdata[writer->pos] = x;
7834 writer->pos++;
7835 ++s;
7836 }
7837 return 0;
7838 }
7839
7840 while (s < e) {
7841 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7842 enum PyUnicode_Kind outkind = writer->kind;
7843 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7844 if (outkind == PyUnicode_1BYTE_KIND) {
7845 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7846 Py_UCS4 maxchar = writer->maxchar;
7847 while (s < e) {
7848 ch = *s;
7849 x = mapdata_ucs2[ch];
7850 if (x > maxchar)
7851 goto Error;
7852 outdata[writer->pos] = x;
7853 writer->pos++;
7854 ++s;
7855 }
7856 break;
7857 }
7858 else if (outkind == PyUnicode_2BYTE_KIND) {
7859 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7860 while (s < e) {
7861 ch = *s;
7862 x = mapdata_ucs2[ch];
7863 if (x == 0xFFFE)
7864 goto Error;
7865 outdata[writer->pos] = x;
7866 writer->pos++;
7867 ++s;
7868 }
7869 break;
7870 }
7871 }
7872 ch = *s;
7873
7874 if (ch < maplen)
7875 x = PyUnicode_READ(mapkind, mapdata, ch);
7876 else
7877 x = 0xfffe; /* invalid value */
7878Error:
7879 if (x == 0xfffe)
7880 {
7881 /* undefined mapping */
7882 startinpos = s-starts;
7883 endinpos = startinpos+1;
7884 if (unicode_decode_call_errorhandler_writer(
7885 errors, &errorHandler,
7886 "charmap", "character maps to <undefined>",
7887 &starts, &e, &startinpos, &endinpos, &exc, &s,
7888 writer)) {
7889 goto onError;
7890 }
7891 continue;
7892 }
7893
7894 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7895 goto onError;
7896 ++s;
7897 }
7898 Py_XDECREF(errorHandler);
7899 Py_XDECREF(exc);
7900 return 0;
7901
7902onError:
7903 Py_XDECREF(errorHandler);
7904 Py_XDECREF(exc);
7905 return -1;
7906}
7907
7908static int
7909charmap_decode_mapping(const char *s,
7910 Py_ssize_t size,
7911 PyObject *mapping,
7912 const char *errors,
7913 _PyUnicodeWriter *writer)
7914{
7915 const char *starts = s;
7916 const char *e;
7917 Py_ssize_t startinpos, endinpos;
7918 PyObject *errorHandler = NULL, *exc = NULL;
7919 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007920 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007921
7922 e = s + size;
7923
7924 while (s < e) {
7925 ch = *s;
7926
7927 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7928 key = PyLong_FromLong((long)ch);
7929 if (key == NULL)
7930 goto onError;
7931
7932 item = PyObject_GetItem(mapping, key);
7933 Py_DECREF(key);
7934 if (item == NULL) {
7935 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7936 /* No mapping found means: mapping is undefined. */
7937 PyErr_Clear();
7938 goto Undefined;
7939 } else
7940 goto onError;
7941 }
7942
7943 /* Apply mapping */
7944 if (item == Py_None)
7945 goto Undefined;
7946 if (PyLong_Check(item)) {
7947 long value = PyLong_AS_LONG(item);
7948 if (value == 0xFFFE)
7949 goto Undefined;
7950 if (value < 0 || value > MAX_UNICODE) {
7951 PyErr_Format(PyExc_TypeError,
7952 "character mapping must be in range(0x%lx)",
7953 (unsigned long)MAX_UNICODE + 1);
7954 goto onError;
7955 }
7956
7957 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7958 goto onError;
7959 }
7960 else if (PyUnicode_Check(item)) {
7961 if (PyUnicode_READY(item) == -1)
7962 goto onError;
7963 if (PyUnicode_GET_LENGTH(item) == 1) {
7964 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7965 if (value == 0xFFFE)
7966 goto Undefined;
7967 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7968 goto onError;
7969 }
7970 else {
7971 writer->overallocate = 1;
7972 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7973 goto onError;
7974 }
7975 }
7976 else {
7977 /* wrong return value */
7978 PyErr_SetString(PyExc_TypeError,
7979 "character mapping must return integer, None or str");
7980 goto onError;
7981 }
7982 Py_CLEAR(item);
7983 ++s;
7984 continue;
7985
7986Undefined:
7987 /* undefined mapping */
7988 Py_CLEAR(item);
7989 startinpos = s-starts;
7990 endinpos = startinpos+1;
7991 if (unicode_decode_call_errorhandler_writer(
7992 errors, &errorHandler,
7993 "charmap", "character maps to <undefined>",
7994 &starts, &e, &startinpos, &endinpos, &exc, &s,
7995 writer)) {
7996 goto onError;
7997 }
7998 }
7999 Py_XDECREF(errorHandler);
8000 Py_XDECREF(exc);
8001 return 0;
8002
8003onError:
8004 Py_XDECREF(item);
8005 Py_XDECREF(errorHandler);
8006 Py_XDECREF(exc);
8007 return -1;
8008}
8009
Alexander Belopolsky40018472011-02-26 01:02:56 +00008010PyObject *
8011PyUnicode_DecodeCharmap(const char *s,
8012 Py_ssize_t size,
8013 PyObject *mapping,
8014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008016 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008017
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 /* Default to Latin-1 */
8019 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008023 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008024 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008025 writer.min_length = size;
8026 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008028
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008029 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008030 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8031 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008032 }
8033 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008034 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8035 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008037 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008038
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008040 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 return NULL;
8042}
8043
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044/* Charmap encoding: the lookup table */
8045
Alexander Belopolsky40018472011-02-26 01:02:56 +00008046struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 PyObject_HEAD
8048 unsigned char level1[32];
8049 int count2, count3;
8050 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008051};
8052
8053static PyObject*
8054encoding_map_size(PyObject *obj, PyObject* args)
8055{
8056 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008059}
8060
8061static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 PyDoc_STR("Return the size (in bytes) of this object") },
8064 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065};
8066
8067static void
8068encoding_map_dealloc(PyObject* o)
8069{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008070 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071}
8072
8073static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 "EncodingMap", /*tp_name*/
8076 sizeof(struct encoding_map), /*tp_basicsize*/
8077 0, /*tp_itemsize*/
8078 /* methods */
8079 encoding_map_dealloc, /*tp_dealloc*/
8080 0, /*tp_print*/
8081 0, /*tp_getattr*/
8082 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008083 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 0, /*tp_repr*/
8085 0, /*tp_as_number*/
8086 0, /*tp_as_sequence*/
8087 0, /*tp_as_mapping*/
8088 0, /*tp_hash*/
8089 0, /*tp_call*/
8090 0, /*tp_str*/
8091 0, /*tp_getattro*/
8092 0, /*tp_setattro*/
8093 0, /*tp_as_buffer*/
8094 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8095 0, /*tp_doc*/
8096 0, /*tp_traverse*/
8097 0, /*tp_clear*/
8098 0, /*tp_richcompare*/
8099 0, /*tp_weaklistoffset*/
8100 0, /*tp_iter*/
8101 0, /*tp_iternext*/
8102 encoding_map_methods, /*tp_methods*/
8103 0, /*tp_members*/
8104 0, /*tp_getset*/
8105 0, /*tp_base*/
8106 0, /*tp_dict*/
8107 0, /*tp_descr_get*/
8108 0, /*tp_descr_set*/
8109 0, /*tp_dictoffset*/
8110 0, /*tp_init*/
8111 0, /*tp_alloc*/
8112 0, /*tp_new*/
8113 0, /*tp_free*/
8114 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115};
8116
8117PyObject*
8118PyUnicode_BuildEncodingMap(PyObject* string)
8119{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 PyObject *result;
8121 struct encoding_map *mresult;
8122 int i;
8123 int need_dict = 0;
8124 unsigned char level1[32];
8125 unsigned char level2[512];
8126 unsigned char *mlevel1, *mlevel2, *mlevel3;
8127 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 int kind;
8129 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008130 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008133 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 PyErr_BadArgument();
8135 return NULL;
8136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 kind = PyUnicode_KIND(string);
8138 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008139 length = PyUnicode_GET_LENGTH(string);
8140 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 memset(level1, 0xFF, sizeof level1);
8142 memset(level2, 0xFF, sizeof level2);
8143
8144 /* If there isn't a one-to-one mapping of NULL to \0,
8145 or if there are non-BMP characters, we need to use
8146 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008147 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008149 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 ch = PyUnicode_READ(kind, data, i);
8152 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008153 need_dict = 1;
8154 break;
8155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 /* unmapped character */
8158 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 l1 = ch >> 11;
8160 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 if (level1[l1] == 0xFF)
8162 level1[l1] = count2++;
8163 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 }
8166
8167 if (count2 >= 0xFF || count3 >= 0xFF)
8168 need_dict = 1;
8169
8170 if (need_dict) {
8171 PyObject *result = PyDict_New();
8172 PyObject *key, *value;
8173 if (!result)
8174 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008175 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008177 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178 if (!key || !value)
8179 goto failed1;
8180 if (PyDict_SetItem(result, key, value) == -1)
8181 goto failed1;
8182 Py_DECREF(key);
8183 Py_DECREF(value);
8184 }
8185 return result;
8186 failed1:
8187 Py_XDECREF(key);
8188 Py_XDECREF(value);
8189 Py_DECREF(result);
8190 return NULL;
8191 }
8192
8193 /* Create a three-level trie */
8194 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8195 16*count2 + 128*count3 - 1);
8196 if (!result)
8197 return PyErr_NoMemory();
8198 PyObject_Init(result, &EncodingMapType);
8199 mresult = (struct encoding_map*)result;
8200 mresult->count2 = count2;
8201 mresult->count3 = count3;
8202 mlevel1 = mresult->level1;
8203 mlevel2 = mresult->level23;
8204 mlevel3 = mresult->level23 + 16*count2;
8205 memcpy(mlevel1, level1, 32);
8206 memset(mlevel2, 0xFF, 16*count2);
8207 memset(mlevel3, 0, 128*count3);
8208 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008209 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008210 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008211 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8212 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213 /* unmapped character */
8214 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 o1 = ch>>11;
8216 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 i2 = 16*mlevel1[o1] + o2;
8218 if (mlevel2[i2] == 0xFF)
8219 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008220 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 i3 = 128*mlevel2[i2] + o3;
8222 mlevel3[i3] = i;
8223 }
8224 return result;
8225}
8226
8227static int
Victor Stinner22168992011-11-20 17:09:18 +01008228encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008229{
8230 struct encoding_map *map = (struct encoding_map*)mapping;
8231 int l1 = c>>11;
8232 int l2 = (c>>7) & 0xF;
8233 int l3 = c & 0x7F;
8234 int i;
8235
Victor Stinner22168992011-11-20 17:09:18 +01008236 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238 if (c == 0)
8239 return 0;
8240 /* level 1*/
8241 i = map->level1[l1];
8242 if (i == 0xFF) {
8243 return -1;
8244 }
8245 /* level 2*/
8246 i = map->level23[16*i+l2];
8247 if (i == 0xFF) {
8248 return -1;
8249 }
8250 /* level 3 */
8251 i = map->level23[16*map->count2 + 128*i + l3];
8252 if (i == 0) {
8253 return -1;
8254 }
8255 return i;
8256}
8257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258/* Lookup the character ch in the mapping. If the character
8259 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008260 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008261static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008262charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263{
Christian Heimes217cfd12007-12-02 14:31:20 +00008264 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 PyObject *x;
8266
8267 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 x = PyObject_GetItem(mapping, w);
8270 Py_DECREF(w);
8271 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8273 /* No mapping found means: mapping is undefined. */
8274 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008275 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 } else
8277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008279 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008281 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 long value = PyLong_AS_LONG(x);
8283 if (value < 0 || value > 255) {
8284 PyErr_SetString(PyExc_TypeError,
8285 "character mapping must be in range(256)");
8286 Py_DECREF(x);
8287 return NULL;
8288 }
8289 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008291 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 /* wrong return value */
8295 PyErr_Format(PyExc_TypeError,
8296 "character mapping must return integer, bytes or None, not %.400s",
8297 x->ob_type->tp_name);
8298 Py_DECREF(x);
8299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 }
8301}
8302
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008303static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008304charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8307 /* exponentially overallocate to minimize reallocations */
8308 if (requiredsize < 2*outsize)
8309 requiredsize = 2*outsize;
8310 if (_PyBytes_Resize(outobj, requiredsize))
8311 return -1;
8312 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313}
8314
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008319 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 space is available. Return a new reference to the object that
8321 was put in the output buffer, or Py_None, if the mapping was undefined
8322 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008323 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008324static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008325charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 PyObject *rep;
8329 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008330 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331
Christian Heimes90aa7642007-12-19 02:45:37 +00008332 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 if (res == -1)
8336 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 if (outsize<requiredsize)
8338 if (charmapencode_resize(outobj, outpos, requiredsize))
8339 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008340 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 outstart[(*outpos)++] = (char)res;
8342 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008343 }
8344
8345 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 Py_DECREF(rep);
8350 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008351 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 if (PyLong_Check(rep)) {
8353 Py_ssize_t requiredsize = *outpos+1;
8354 if (outsize<requiredsize)
8355 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8356 Py_DECREF(rep);
8357 return enc_EXCEPTION;
8358 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008359 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008361 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 else {
8363 const char *repchars = PyBytes_AS_STRING(rep);
8364 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8365 Py_ssize_t requiredsize = *outpos+repsize;
8366 if (outsize<requiredsize)
8367 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8368 Py_DECREF(rep);
8369 return enc_EXCEPTION;
8370 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008371 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 memcpy(outstart + *outpos, repchars, repsize);
8373 *outpos += repsize;
8374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 Py_DECREF(rep);
8377 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378}
8379
8380/* handle an error in PyUnicode_EncodeCharmap
8381 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008382static int
8383charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008384 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008386 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008387 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388{
8389 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008390 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008391 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008392 enum PyUnicode_Kind kind;
8393 void *data;
8394 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008396 Py_ssize_t collstartpos = *inpos;
8397 Py_ssize_t collendpos = *inpos+1;
8398 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 char *encoding = "charmap";
8400 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008401 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008403 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404
Benjamin Petersonbac79492012-01-14 13:34:47 -05008405 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008406 return -1;
8407 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 /* find all unencodable characters */
8409 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008411 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008412 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008413 val = encoding_map_lookup(ch, mapping);
8414 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 break;
8416 ++collendpos;
8417 continue;
8418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008419
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008420 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8421 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 if (rep==NULL)
8423 return -1;
8424 else if (rep!=Py_None) {
8425 Py_DECREF(rep);
8426 break;
8427 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 }
8431 /* cache callback name lookup
8432 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008433 if (*error_handler == _Py_ERROR_UNKNOWN)
8434 *error_handler = get_error_handler(errors);
8435
8436 switch (*error_handler) {
8437 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008438 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008439 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008440
8441 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 x = charmapencode_output('?', mapping, res, respos);
8444 if (x==enc_EXCEPTION) {
8445 return -1;
8446 }
8447 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008448 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return -1;
8450 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 }
8452 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008453 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 *inpos = collendpos;
8455 break;
Victor Stinner50149202015-09-22 00:26:54 +02008456
8457 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 /* generate replacement (temporarily (mis)uses p) */
8459 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 char buffer[2+29+1+1];
8461 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008462 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 for (cp = buffer; *cp; ++cp) {
8464 x = charmapencode_output(*cp, mapping, res, respos);
8465 if (x==enc_EXCEPTION)
8466 return -1;
8467 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008468 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 return -1;
8470 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 }
8472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 *inpos = collendpos;
8474 break;
Victor Stinner50149202015-09-22 00:26:54 +02008475
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 default:
Victor Stinner50149202015-09-22 00:26:54 +02008477 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008478 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008482 if (PyBytes_Check(repunicode)) {
8483 /* Directly copy bytes result to output. */
8484 Py_ssize_t outsize = PyBytes_Size(*res);
8485 Py_ssize_t requiredsize;
8486 repsize = PyBytes_Size(repunicode);
8487 requiredsize = *respos + repsize;
8488 if (requiredsize > outsize)
8489 /* Make room for all additional bytes. */
8490 if (charmapencode_resize(res, respos, requiredsize)) {
8491 Py_DECREF(repunicode);
8492 return -1;
8493 }
8494 memcpy(PyBytes_AsString(*res) + *respos,
8495 PyBytes_AsString(repunicode), repsize);
8496 *respos += repsize;
8497 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008498 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008499 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008500 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008501 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008502 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008503 Py_DECREF(repunicode);
8504 return -1;
8505 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008506 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008507 data = PyUnicode_DATA(repunicode);
8508 kind = PyUnicode_KIND(repunicode);
8509 for (index = 0; index < repsize; index++) {
8510 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8511 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008513 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 return -1;
8515 }
8516 else if (x==enc_FAILED) {
8517 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008518 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 return -1;
8520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008521 }
8522 *inpos = newpos;
8523 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 }
8525 return 0;
8526}
8527
Alexander Belopolsky40018472011-02-26 01:02:56 +00008528PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008529_PyUnicode_EncodeCharmap(PyObject *unicode,
8530 PyObject *mapping,
8531 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 /* output object */
8534 PyObject *res = NULL;
8535 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008536 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008537 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008539 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008540 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008542 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008543 void *data;
8544 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
Benjamin Petersonbac79492012-01-14 13:34:47 -05008546 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008547 return NULL;
8548 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008549 data = PyUnicode_DATA(unicode);
8550 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008551
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 /* Default to Latin-1 */
8553 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 /* allocate enough for a simple encoding without
8557 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008558 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 if (res == NULL)
8560 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008561 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008565 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008567 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 if (x==enc_EXCEPTION) /* error */
8569 goto onError;
8570 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008571 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008573 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 &res, &respos)) {
8575 goto onError;
8576 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008577 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 else
8579 /* done with this character => adjust input position */
8580 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008584 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008585 if (_PyBytes_Resize(&res, respos) < 0)
8586 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008589 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 return res;
8591
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593 Py_XDECREF(res);
8594 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008595 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 return NULL;
8597}
8598
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008599/* Deprecated */
8600PyObject *
8601PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8602 Py_ssize_t size,
8603 PyObject *mapping,
8604 const char *errors)
8605{
8606 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008607 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008608 if (unicode == NULL)
8609 return NULL;
8610 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8611 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008612 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008613}
8614
Alexander Belopolsky40018472011-02-26 01:02:56 +00008615PyObject *
8616PyUnicode_AsCharmapString(PyObject *unicode,
8617 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618{
8619 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 PyErr_BadArgument();
8621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008623 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624}
8625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008627static void
8628make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008630 Py_ssize_t startpos, Py_ssize_t endpos,
8631 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 *exceptionObject = _PyUnicodeTranslateError_Create(
8635 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 }
8637 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8639 goto onError;
8640 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8641 goto onError;
8642 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8643 goto onError;
8644 return;
8645 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008646 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647 }
8648}
8649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650/* error handling callback helper:
8651 build arguments, call the callback and check the arguments,
8652 put the result into newpos and return the replacement string, which
8653 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008654static PyObject *
8655unicode_translate_call_errorhandler(const char *errors,
8656 PyObject **errorHandler,
8657 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659 Py_ssize_t startpos, Py_ssize_t endpos,
8660 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008662 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008664 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 PyObject *restuple;
8666 PyObject *resunicode;
8667
8668 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 }
8673
8674 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008679 restuple = PyObject_CallFunctionObjArgs(
8680 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008684 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 Py_DECREF(restuple);
8686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008688 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 &resunicode, &i_newpos)) {
8690 Py_DECREF(restuple);
8691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008693 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 else
8696 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008698 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 Py_DECREF(restuple);
8700 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008701 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 Py_INCREF(resunicode);
8703 Py_DECREF(restuple);
8704 return resunicode;
8705}
8706
8707/* Lookup the character ch in the mapping and put the result in result,
8708 which must be decrefed by the caller.
8709 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008710static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712{
Christian Heimes217cfd12007-12-02 14:31:20 +00008713 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 PyObject *x;
8715
8716 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 x = PyObject_GetItem(mapping, w);
8719 Py_DECREF(w);
8720 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8722 /* No mapping found means: use 1:1 mapping. */
8723 PyErr_Clear();
8724 *result = NULL;
8725 return 0;
8726 } else
8727 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 }
8729 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 *result = x;
8731 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008733 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008735 if (value < 0 || value > MAX_UNICODE) {
8736 PyErr_Format(PyExc_ValueError,
8737 "character mapping must be in range(0x%x)",
8738 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 Py_DECREF(x);
8740 return -1;
8741 }
8742 *result = x;
8743 return 0;
8744 }
8745 else if (PyUnicode_Check(x)) {
8746 *result = x;
8747 return 0;
8748 }
8749 else {
8750 /* wrong return value */
8751 PyErr_SetString(PyExc_TypeError,
8752 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008753 Py_DECREF(x);
8754 return -1;
8755 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756}
Victor Stinner1194ea02014-04-04 19:37:40 +02008757
8758/* lookup the character, write the result into the writer.
8759 Return 1 if the result was written into the writer, return 0 if the mapping
8760 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008762charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8763 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764{
Victor Stinner1194ea02014-04-04 19:37:40 +02008765 PyObject *item;
8766
8767 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008769
8770 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008772 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008775 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008777
8778 if (item == Py_None) {
8779 Py_DECREF(item);
8780 return 0;
8781 }
8782
8783 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008784 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8785 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8786 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008787 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8788 Py_DECREF(item);
8789 return -1;
8790 }
8791 Py_DECREF(item);
8792 return 1;
8793 }
8794
8795 if (!PyUnicode_Check(item)) {
8796 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008798 }
8799
8800 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8801 Py_DECREF(item);
8802 return -1;
8803 }
8804
8805 Py_DECREF(item);
8806 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807}
8808
Victor Stinner89a76ab2014-04-05 11:44:04 +02008809static int
8810unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8811 Py_UCS1 *translate)
8812{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008813 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008814 int ret = 0;
8815
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 if (charmaptranslate_lookup(ch, mapping, &item)) {
8817 return -1;
8818 }
8819
8820 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008821 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008822 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008823 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008824 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008825 /* not found => default to 1:1 mapping */
8826 translate[ch] = ch;
8827 return 1;
8828 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008829 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008830 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008831 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8832 used it */
8833 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008834 /* invalid character or character outside ASCII:
8835 skip the fast translate */
8836 goto exit;
8837 }
8838 translate[ch] = (Py_UCS1)replace;
8839 }
8840 else if (PyUnicode_Check(item)) {
8841 Py_UCS4 replace;
8842
8843 if (PyUnicode_READY(item) == -1) {
8844 Py_DECREF(item);
8845 return -1;
8846 }
8847 if (PyUnicode_GET_LENGTH(item) != 1)
8848 goto exit;
8849
8850 replace = PyUnicode_READ_CHAR(item, 0);
8851 if (replace > 127)
8852 goto exit;
8853 translate[ch] = (Py_UCS1)replace;
8854 }
8855 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008856 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857 goto exit;
8858 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 ret = 1;
8860
Benjamin Peterson1365de72014-04-07 20:15:41 -04008861 exit:
8862 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 return ret;
8864}
8865
8866/* Fast path for ascii => ascii translation. Return 1 if the whole string
8867 was translated into writer, return 0 if the input string was partially
8868 translated into writer, raise an exception and return -1 on error. */
8869static int
8870unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008871 _PyUnicodeWriter *writer, int ignore,
8872 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008873{
Victor Stinner872b2912014-04-05 14:27:07 +02008874 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 Py_ssize_t len;
8876 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008877 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 len = PyUnicode_GET_LENGTH(input);
8880
Victor Stinner872b2912014-04-05 14:27:07 +02008881 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882
8883 in = PyUnicode_1BYTE_DATA(input);
8884 end = in + len;
8885
8886 assert(PyUnicode_IS_ASCII(writer->buffer));
8887 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8888 out = PyUnicode_1BYTE_DATA(writer->buffer);
8889
Victor Stinner872b2912014-04-05 14:27:07 +02008890 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008892 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008894 int translate = unicode_fast_translate_lookup(mapping, ch,
8895 ascii_table);
8896 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008898 if (translate == 0)
8899 goto exit;
8900 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008901 }
Victor Stinner872b2912014-04-05 14:27:07 +02008902 if (ch2 == 0xfe) {
8903 if (ignore)
8904 continue;
8905 goto exit;
8906 }
8907 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008909 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 }
Victor Stinner872b2912014-04-05 14:27:07 +02008911 res = 1;
8912
8913exit:
8914 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008915 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008916 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917}
8918
Victor Stinner3222da22015-10-01 22:07:32 +02008919static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920_PyUnicode_TranslateCharmap(PyObject *input,
8921 PyObject *mapping,
8922 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008925 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 Py_ssize_t size, i;
8927 int kind;
8928 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008929 _PyUnicodeWriter writer;
8930 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008931 char *reason = "character maps to <undefined>";
8932 PyObject *errorHandler = NULL;
8933 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008934 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008935 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008936
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 PyErr_BadArgument();
8939 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 if (PyUnicode_READY(input) == -1)
8943 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008944 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 kind = PyUnicode_KIND(input);
8946 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008948 if (size == 0)
8949 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008951 /* allocate enough for a simple 1:1 translation without
8952 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008953 _PyUnicodeWriter_Init(&writer);
8954 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956
Victor Stinner872b2912014-04-05 14:27:07 +02008957 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8958
Victor Stinner33798672016-03-01 21:59:58 +01008959 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008960 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008961 if (PyUnicode_IS_ASCII(input)) {
8962 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8963 if (res < 0) {
8964 _PyUnicodeWriter_Dealloc(&writer);
8965 return NULL;
8966 }
8967 if (res == 1)
8968 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969 }
Victor Stinner33798672016-03-01 21:59:58 +01008970 else {
8971 i = 0;
8972 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008976 int translate;
8977 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8978 Py_ssize_t newpos;
8979 /* startpos for collecting untranslatable chars */
8980 Py_ssize_t collstart;
8981 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008982 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
Victor Stinner1194ea02014-04-04 19:37:40 +02008984 ch = PyUnicode_READ(kind, data, i);
8985 translate = charmaptranslate_output(ch, mapping, &writer);
8986 if (translate < 0)
8987 goto onError;
8988
8989 if (translate != 0) {
8990 /* it worked => adjust input pointer */
8991 ++i;
8992 continue;
8993 }
8994
8995 /* untranslatable character */
8996 collstart = i;
8997 collend = i+1;
8998
8999 /* find all untranslatable characters */
9000 while (collend < size) {
9001 PyObject *x;
9002 ch = PyUnicode_READ(kind, data, collend);
9003 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009004 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009005 Py_XDECREF(x);
9006 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009008 ++collend;
9009 }
9010
9011 if (ignore) {
9012 i = collend;
9013 }
9014 else {
9015 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9016 reason, input, &exc,
9017 collstart, collend, &newpos);
9018 if (repunicode == NULL)
9019 goto onError;
9020 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009022 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009023 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009024 Py_DECREF(repunicode);
9025 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009026 }
9027 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009028 Py_XDECREF(exc);
9029 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009030 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009033 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009034 Py_XDECREF(exc);
9035 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 return NULL;
9037}
9038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039/* Deprecated. Use PyUnicode_Translate instead. */
9040PyObject *
9041PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9042 Py_ssize_t size,
9043 PyObject *mapping,
9044 const char *errors)
9045{
Christian Heimes5f520f42012-09-11 14:03:25 +02009046 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009047 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 if (!unicode)
9049 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009050 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9051 Py_DECREF(unicode);
9052 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053}
9054
Alexander Belopolsky40018472011-02-26 01:02:56 +00009055PyObject *
9056PyUnicode_Translate(PyObject *str,
9057 PyObject *mapping,
9058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009060 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009061 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009062 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063}
Tim Petersced69f82003-09-16 20:30:58 +00009064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009066fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067{
9068 /* No need to call PyUnicode_READY(self) because this function is only
9069 called as a callback from fixup() which does it already. */
9070 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9071 const int kind = PyUnicode_KIND(self);
9072 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009073 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009074 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 Py_ssize_t i;
9076
9077 for (i = 0; i < len; ++i) {
9078 ch = PyUnicode_READ(kind, data, i);
9079 fixed = 0;
9080 if (ch > 127) {
9081 if (Py_UNICODE_ISSPACE(ch))
9082 fixed = ' ';
9083 else {
9084 const int decimal = Py_UNICODE_TODECIMAL(ch);
9085 if (decimal >= 0)
9086 fixed = '0' + decimal;
9087 }
9088 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009089 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009090 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 PyUnicode_WRITE(kind, data, i, fixed);
9092 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009093 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009094 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 }
9097
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009098 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099}
9100
9101PyObject *
9102_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9103{
9104 if (!PyUnicode_Check(unicode)) {
9105 PyErr_BadInternalCall();
9106 return NULL;
9107 }
9108 if (PyUnicode_READY(unicode) == -1)
9109 return NULL;
9110 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9111 /* If the string is already ASCII, just return the same string */
9112 Py_INCREF(unicode);
9113 return unicode;
9114 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009115 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116}
9117
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009118PyObject *
9119PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9120 Py_ssize_t length)
9121{
Victor Stinnerf0124502011-11-21 23:12:56 +01009122 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009123 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009124 Py_UCS4 maxchar;
9125 enum PyUnicode_Kind kind;
9126 void *data;
9127
Victor Stinner99d7ad02012-02-22 13:37:39 +01009128 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009129 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009130 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009131 if (ch > 127) {
9132 int decimal = Py_UNICODE_TODECIMAL(ch);
9133 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009134 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009135 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009136 }
9137 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009138
9139 /* Copy to a new string */
9140 decimal = PyUnicode_New(length, maxchar);
9141 if (decimal == NULL)
9142 return decimal;
9143 kind = PyUnicode_KIND(decimal);
9144 data = PyUnicode_DATA(decimal);
9145 /* Iterate over code points */
9146 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009147 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009148 if (ch > 127) {
9149 int decimal = Py_UNICODE_TODECIMAL(ch);
9150 if (decimal >= 0)
9151 ch = '0' + decimal;
9152 }
9153 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009155 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009156}
/* --- Decimal Encoder ---------------------------------------------------- */
9158
Alexander Belopolsky40018472011-02-26 01:02:56 +00009159int
9160PyUnicode_EncodeDecimal(Py_UNICODE *s,
9161 Py_ssize_t length,
9162 char *output,
9163 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009164{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009165 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009166 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009167 enum PyUnicode_Kind kind;
9168 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009169
9170 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 PyErr_BadArgument();
9172 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009173 }
9174
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009175 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009176 if (unicode == NULL)
9177 return -1;
9178
Victor Stinner42bf7752011-11-21 22:52:58 +01009179 kind = PyUnicode_KIND(unicode);
9180 data = PyUnicode_DATA(unicode);
9181
Victor Stinnerb84d7232011-11-22 01:50:07 +01009182 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009183 PyObject *exc;
9184 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009186 Py_ssize_t startpos;
9187
9188 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009189
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009191 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009192 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 decimal = Py_UNICODE_TODECIMAL(ch);
9196 if (decimal >= 0) {
9197 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009198 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 continue;
9200 }
9201 if (0 < ch && ch < 256) {
9202 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009203 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 continue;
9205 }
Victor Stinner6345be92011-11-25 20:09:01 +01009206
Victor Stinner42bf7752011-11-21 22:52:58 +01009207 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009208 exc = NULL;
9209 raise_encode_exception(&exc, "decimal", unicode,
9210 startpos, startpos+1,
9211 "invalid decimal Unicode string");
9212 Py_XDECREF(exc);
9213 Py_DECREF(unicode);
9214 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009215 }
9216 /* 0-terminate the output string */
9217 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009218 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009219 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220}
9221
/* --- Helpers ------------------------------------------------------------ */
9223
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`:

     - `end` greater than `len` is clamped to `len`; a negative `end` has
       `len` added and is then clamped to 0;
     - a negative `start` has `len` added and is then clamped to 0.  A
       `start` larger than `len` is left untouched.

   `start` and `end` must be modifiable lvalues.  `len` is evaluated more
   than once, so it must be free of side effects.  The body is wrapped in
   do { } while (0) so that an invocation followed by ';' forms exactly one
   statement and stays correct inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009240any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009242 Py_ssize_t end,
9243 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009245 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 void *buf1, *buf2;
9247 Py_ssize_t len1, len2, result;
9248
9249 kind1 = PyUnicode_KIND(s1);
9250 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009251 if (kind1 < kind2)
9252 return -1;
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 len1 = PyUnicode_GET_LENGTH(s1);
9255 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 ADJUST_INDICES(start, end, len1);
9257 if (end - start < len2)
9258 return -1;
9259
9260 buf1 = PyUnicode_DATA(s1);
9261 buf2 = PyUnicode_DATA(s2);
9262 if (len2 == 1) {
9263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9264 result = findchar((const char *)buf1 + kind1*start,
9265 kind1, end - start, ch, direction);
9266 if (result == -1)
9267 return -1;
9268 else
9269 return start + result;
9270 }
9271
9272 if (kind2 != kind1) {
9273 buf2 = _PyUnicode_AsKind(s2, kind1);
9274 if (!buf2)
9275 return -2;
9276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277
Victor Stinner794d5672011-10-10 03:21:36 +02009278 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
9293 assert(0); result = -2;
9294 }
9295 }
9296 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009298 case PyUnicode_1BYTE_KIND:
9299 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9300 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 else
9302 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 break;
9304 case PyUnicode_2BYTE_KIND:
9305 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_4BYTE_KIND:
9308 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 default:
9311 assert(0); result = -2;
9312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009315 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 PyMem_Free(buf2);
9317
9318 return result;
9319}
9320
9321Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009322_PyUnicode_InsertThousandsGrouping(
9323 PyObject *unicode, Py_ssize_t index,
9324 Py_ssize_t n_buffer,
9325 void *digits, Py_ssize_t n_digits,
9326 Py_ssize_t min_width,
9327 const char *grouping, PyObject *thousands_sep,
9328 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329{
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009331 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 Py_ssize_t thousands_sep_len;
9333 Py_ssize_t len;
9334
9335 if (unicode != NULL) {
9336 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009337 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009338 }
9339 else {
9340 kind = PyUnicode_1BYTE_KIND;
9341 data = NULL;
9342 }
9343 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9344 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9345 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9346 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009347 if (thousands_sep_kind < kind) {
9348 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9349 if (!thousands_sep_data)
9350 return -1;
9351 }
9352 else {
9353 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9354 if (!data)
9355 return -1;
9356 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 }
9358
Benjamin Petersonead6b532011-12-20 17:23:42 -06009359 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009361 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009363 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009366 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009368 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009370 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009371 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009374 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009376 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009382 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 break;
9384 default:
9385 assert(0);
9386 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009388 if (unicode != NULL && thousands_sep_kind != kind) {
9389 if (thousands_sep_kind < kind)
9390 PyMem_Free(thousands_sep_data);
9391 else
9392 PyMem_Free(data);
9393 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009394 if (unicode == NULL) {
9395 *maxchar = 127;
9396 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009397 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009398 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009399 }
9400 }
9401 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402}
9403
9404
Alexander Belopolsky40018472011-02-26 01:02:56 +00009405Py_ssize_t
9406PyUnicode_Count(PyObject *str,
9407 PyObject *substr,
9408 Py_ssize_t start,
9409 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009411 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009412 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 void *buf1 = NULL, *buf2 = NULL;
9414 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009416 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009418
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009419 kind1 = PyUnicode_KIND(str);
9420 kind2 = PyUnicode_KIND(substr);
9421 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009424 len1 = PyUnicode_GET_LENGTH(str);
9425 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009427 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 buf1 = PyUnicode_DATA(str);
9431 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009434 if (!buf2)
9435 goto onError;
9436 }
9437
9438 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009440 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009441 result = asciilib_count(
9442 ((Py_UCS1*)buf1) + start, end - start,
9443 buf2, len2, PY_SSIZE_T_MAX
9444 );
9445 else
9446 result = ucs1lib_count(
9447 ((Py_UCS1*)buf1) + start, end - start,
9448 buf2, len2, PY_SSIZE_T_MAX
9449 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 break;
9451 case PyUnicode_2BYTE_KIND:
9452 result = ucs2lib_count(
9453 ((Py_UCS2*)buf1) + start, end - start,
9454 buf2, len2, PY_SSIZE_T_MAX
9455 );
9456 break;
9457 case PyUnicode_4BYTE_KIND:
9458 result = ucs4lib_count(
9459 ((Py_UCS4*)buf1) + start, end - start,
9460 buf2, len2, PY_SSIZE_T_MAX
9461 );
9462 break;
9463 default:
9464 assert(0); result = 0;
9465 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009466
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009467 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyMem_Free(buf2);
9469
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009472 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 PyMem_Free(buf2);
9474 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Alexander Belopolsky40018472011-02-26 01:02:56 +00009477Py_ssize_t
9478PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009479 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009480 Py_ssize_t start,
9481 Py_ssize_t end,
9482 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009484 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009485 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009486
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009487 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488}
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490Py_ssize_t
9491PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9492 Py_ssize_t start, Py_ssize_t end,
9493 int direction)
9494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009496 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 if (PyUnicode_READY(str) == -1)
9498 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009499 len = PyUnicode_GET_LENGTH(str);
9500 ADJUST_INDICES(start, end, len);
9501 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009502 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009504 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9505 kind, end-start, ch, direction);
9506 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009508 else
9509 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510}
9511
Alexander Belopolsky40018472011-02-26 01:02:56 +00009512static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009513tailmatch(PyObject *self,
9514 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009515 Py_ssize_t start,
9516 Py_ssize_t end,
9517 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 int kind_self;
9520 int kind_sub;
9521 void *data_self;
9522 void *data_sub;
9523 Py_ssize_t offset;
9524 Py_ssize_t i;
9525 Py_ssize_t end_sub;
9526
9527 if (PyUnicode_READY(self) == -1 ||
9528 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009529 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9532 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009536 if (PyUnicode_GET_LENGTH(substring) == 0)
9537 return 1;
9538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 kind_self = PyUnicode_KIND(self);
9540 data_self = PyUnicode_DATA(self);
9541 kind_sub = PyUnicode_KIND(substring);
9542 data_sub = PyUnicode_DATA(substring);
9543 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9544
9545 if (direction > 0)
9546 offset = end;
9547 else
9548 offset = start;
9549
9550 if (PyUnicode_READ(kind_self, data_self, offset) ==
9551 PyUnicode_READ(kind_sub, data_sub, 0) &&
9552 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9553 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9554 /* If both are of the same kind, memcmp is sufficient */
9555 if (kind_self == kind_sub) {
9556 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009557 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 data_sub,
9559 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009560 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009562 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 else {
9564 /* We do not need to compare 0 and len(substring)-1 because
9565 the if statement above ensured already that they are equal
9566 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 for (i = 1; i < end_sub; ++i) {
9568 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9569 PyUnicode_READ(kind_sub, data_sub, i))
9570 return 0;
9571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574 }
9575
9576 return 0;
9577}
9578
Alexander Belopolsky40018472011-02-26 01:02:56 +00009579Py_ssize_t
9580PyUnicode_Tailmatch(PyObject *str,
9581 PyObject *substr,
9582 Py_ssize_t start,
9583 Py_ssize_t end,
9584 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009586 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009588
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009589 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590}
9591
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592/* Apply fixfct filter to the Unicode object self and return a
9593 reference to the modified object */
9594
Alexander Belopolsky40018472011-02-26 01:02:56 +00009595static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009596fixup(PyObject *self,
9597 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 PyObject *u;
9600 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009601 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009603 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009605 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009606 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 /* fix functions return the new maximum character in a string,
9609 if the kind of the resulting unicode object does not change,
9610 everything is fine. Otherwise we need to change the string kind
9611 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009612 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009613
9614 if (maxchar_new == 0) {
9615 /* no changes */;
9616 if (PyUnicode_CheckExact(self)) {
9617 Py_DECREF(u);
9618 Py_INCREF(self);
9619 return self;
9620 }
9621 else
9622 return u;
9623 }
9624
Victor Stinnere6abb482012-05-02 01:15:40 +02009625 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626
Victor Stinnereaab6042011-12-11 22:22:39 +01009627 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009629
9630 /* In case the maximum character changed, we need to
9631 convert the string to the new category. */
9632 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9633 if (v == NULL) {
9634 Py_DECREF(u);
9635 return NULL;
9636 }
9637 if (maxchar_new > maxchar_old) {
9638 /* If the maxchar increased so that the kind changed, not all
9639 characters are representable anymore and we need to fix the
9640 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009641 _PyUnicode_FastCopyCharacters(v, 0,
9642 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009643 maxchar_old = fixfct(v);
9644 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 }
9646 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009647 _PyUnicode_FastCopyCharacters(v, 0,
9648 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009650 Py_DECREF(u);
9651 assert(_PyUnicode_CheckConsistency(v, 1));
9652 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653}
9654
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655static PyObject *
9656ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9659 char *resdata, *data = PyUnicode_DATA(self);
9660 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009661
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 res = PyUnicode_New(len, 127);
9663 if (res == NULL)
9664 return NULL;
9665 resdata = PyUnicode_DATA(res);
9666 if (lower)
9667 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 _Py_bytes_upper(resdata, data, len);
9670 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671}
9672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676 Py_ssize_t j;
9677 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009678 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009680
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9682
9683 where ! is a negation and \p{xxx} is a character with property xxx.
9684 */
9685 for (j = i - 1; j >= 0; j--) {
9686 c = PyUnicode_READ(kind, data, j);
9687 if (!_PyUnicode_IsCaseIgnorable(c))
9688 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9691 if (final_sigma) {
9692 for (j = i + 1; j < length; j++) {
9693 c = PyUnicode_READ(kind, data, j);
9694 if (!_PyUnicode_IsCaseIgnorable(c))
9695 break;
9696 }
9697 final_sigma = j == length || !_PyUnicode_IsCased(c);
9698 }
9699 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700}
9701
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009702static int
9703lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9704 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706 /* Obscure special case. */
9707 if (c == 0x3A3) {
9708 mapped[0] = handle_capital_sigma(kind, data, length, i);
9709 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712}
9713
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714static Py_ssize_t
9715do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 Py_ssize_t i, k = 0;
9718 int n_res, j;
9719 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009720
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 c = PyUnicode_READ(kind, data, 0);
9722 n_res = _PyUnicode_ToUpperFull(c, mapped);
9723 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009724 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009725 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009727 for (i = 1; i < length; i++) {
9728 c = PyUnicode_READ(kind, data, i);
9729 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9730 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009731 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009733 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009734 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009735 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736}
9737
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009738static Py_ssize_t
9739do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9740 Py_ssize_t i, k = 0;
9741
9742 for (i = 0; i < length; i++) {
9743 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9744 int n_res, j;
9745 if (Py_UNICODE_ISUPPER(c)) {
9746 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9747 }
9748 else if (Py_UNICODE_ISLOWER(c)) {
9749 n_res = _PyUnicode_ToUpperFull(c, mapped);
9750 }
9751 else {
9752 n_res = 1;
9753 mapped[0] = c;
9754 }
9755 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009756 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009757 res[k++] = mapped[j];
9758 }
9759 }
9760 return k;
9761}
9762
9763static Py_ssize_t
9764do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9765 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009767 Py_ssize_t i, k = 0;
9768
9769 for (i = 0; i < length; i++) {
9770 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9771 int n_res, j;
9772 if (lower)
9773 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9774 else
9775 n_res = _PyUnicode_ToUpperFull(c, mapped);
9776 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009777 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009778 res[k++] = mapped[j];
9779 }
9780 }
9781 return k;
9782}
9783
9784static Py_ssize_t
9785do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9786{
9787 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9788}
9789
9790static Py_ssize_t
9791do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792{
9793 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9794}
9795
Benjamin Petersone51757f2012-01-12 21:10:29 -05009796static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009797do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9798{
9799 Py_ssize_t i, k = 0;
9800
9801 for (i = 0; i < length; i++) {
9802 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9803 Py_UCS4 mapped[3];
9804 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9805 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009806 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009807 res[k++] = mapped[j];
9808 }
9809 }
9810 return k;
9811}
9812
9813static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009814do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9815{
9816 Py_ssize_t i, k = 0;
9817 int previous_is_cased;
9818
9819 previous_is_cased = 0;
9820 for (i = 0; i < length; i++) {
9821 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9822 Py_UCS4 mapped[3];
9823 int n_res, j;
9824
9825 if (previous_is_cased)
9826 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9827 else
9828 n_res = _PyUnicode_ToTitleFull(c, mapped);
9829
9830 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009831 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009832 res[k++] = mapped[j];
9833 }
9834
9835 previous_is_cased = _PyUnicode_IsCased(c);
9836 }
9837 return k;
9838}
9839
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009840static PyObject *
9841case_operation(PyObject *self,
9842 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9843{
9844 PyObject *res = NULL;
9845 Py_ssize_t length, newlength = 0;
9846 int kind, outkind;
9847 void *data, *outdata;
9848 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9849
Benjamin Petersoneea48462012-01-16 14:28:50 -05009850 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009851
9852 kind = PyUnicode_KIND(self);
9853 data = PyUnicode_DATA(self);
9854 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009855 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009856 PyErr_SetString(PyExc_OverflowError, "string is too long");
9857 return NULL;
9858 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009859 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009860 if (tmp == NULL)
9861 return PyErr_NoMemory();
9862 newlength = perform(kind, data, length, tmp, &maxchar);
9863 res = PyUnicode_New(newlength, maxchar);
9864 if (res == NULL)
9865 goto leave;
9866 tmpend = tmp + newlength;
9867 outdata = PyUnicode_DATA(res);
9868 outkind = PyUnicode_KIND(res);
9869 switch (outkind) {
9870 case PyUnicode_1BYTE_KIND:
9871 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9872 break;
9873 case PyUnicode_2BYTE_KIND:
9874 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9875 break;
9876 case PyUnicode_4BYTE_KIND:
9877 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9878 break;
9879 default:
9880 assert(0);
9881 break;
9882 }
9883 leave:
9884 PyMem_FREE(tmp);
9885 return res;
9886}
9887
Tim Peters8ce9f162004-08-27 01:49:32 +00009888PyObject *
9889PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009891 PyObject *res;
9892 PyObject *fseq;
9893 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009894 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009896 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009897 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009898 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009899 }
9900
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009901 /* NOTE: the following code can't call back into Python code,
9902 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009903 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009904
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009905 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009906 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009907 res = _PyUnicode_JoinArray(separator, items, seqlen);
9908 Py_DECREF(fseq);
9909 return res;
9910}
9911
9912PyObject *
9913_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9914{
9915 PyObject *res = NULL; /* the result */
9916 PyObject *sep = NULL;
9917 Py_ssize_t seplen;
9918 PyObject *item;
9919 Py_ssize_t sz, i, res_offset;
9920 Py_UCS4 maxchar;
9921 Py_UCS4 item_maxchar;
9922 int use_memcpy;
9923 unsigned char *res_data = NULL, *sep_data = NULL;
9924 PyObject *last_obj;
9925 unsigned int kind = 0;
9926
Tim Peters05eba1f2004-08-27 21:32:02 +00009927 /* If empty sequence, return u"". */
9928 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009929 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009930 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009931
Tim Peters05eba1f2004-08-27 21:32:02 +00009932 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009933 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009934 if (seqlen == 1) {
9935 if (PyUnicode_CheckExact(items[0])) {
9936 res = items[0];
9937 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009938 return res;
9939 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009940 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009941 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009942 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009943 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009944 /* Set up sep and seplen */
9945 if (separator == NULL) {
9946 /* fall back to a blank space separator */
9947 sep = PyUnicode_FromOrdinal(' ');
9948 if (!sep)
9949 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009950 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009951 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009952 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009953 else {
9954 if (!PyUnicode_Check(separator)) {
9955 PyErr_Format(PyExc_TypeError,
9956 "separator: expected str instance,"
9957 " %.80s found",
9958 Py_TYPE(separator)->tp_name);
9959 goto onError;
9960 }
9961 if (PyUnicode_READY(separator))
9962 goto onError;
9963 sep = separator;
9964 seplen = PyUnicode_GET_LENGTH(separator);
9965 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9966 /* inc refcount to keep this code path symmetric with the
9967 above case of a blank separator */
9968 Py_INCREF(sep);
9969 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009970 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009971 }
9972
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009973 /* There are at least two things to join, or else we have a subclass
9974 * of str in the sequence.
9975 * Do a pre-pass to figure out the total amount of space we'll
9976 * need (sz), and see whether all argument are strings.
9977 */
9978 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009979#ifdef Py_DEBUG
9980 use_memcpy = 0;
9981#else
9982 use_memcpy = 1;
9983#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009984 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009985 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009986 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009987 if (!PyUnicode_Check(item)) {
9988 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009989 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009990 " %.80s found",
9991 i, Py_TYPE(item)->tp_name);
9992 goto onError;
9993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 if (PyUnicode_READY(item) == -1)
9995 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009996 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009998 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009999 if (i != 0) {
10000 add_sz += seplen;
10001 }
10002 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010004 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010005 goto onError;
10006 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010007 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 if (use_memcpy && last_obj != NULL) {
10009 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10010 use_memcpy = 0;
10011 }
10012 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010013 }
Tim Petersced69f82003-09-16 20:30:58 +000010014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010016 if (res == NULL)
10017 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010018
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010019 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010020#ifdef Py_DEBUG
10021 use_memcpy = 0;
10022#else
10023 if (use_memcpy) {
10024 res_data = PyUnicode_1BYTE_DATA(res);
10025 kind = PyUnicode_KIND(res);
10026 if (seplen != 0)
10027 sep_data = PyUnicode_1BYTE_DATA(sep);
10028 }
10029#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010030 if (use_memcpy) {
10031 for (i = 0; i < seqlen; ++i) {
10032 Py_ssize_t itemlen;
10033 item = items[i];
10034
10035 /* Copy item, and maybe the separator. */
10036 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010037 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010038 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010039 kind * seplen);
10040 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010041 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010042
10043 itemlen = PyUnicode_GET_LENGTH(item);
10044 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010045 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010046 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010047 kind * itemlen);
10048 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010049 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010050 }
10051 assert(res_data == PyUnicode_1BYTE_DATA(res)
10052 + kind * PyUnicode_GET_LENGTH(res));
10053 }
10054 else {
10055 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10056 Py_ssize_t itemlen;
10057 item = items[i];
10058
10059 /* Copy item, and maybe the separator. */
10060 if (i && seplen != 0) {
10061 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10062 res_offset += seplen;
10063 }
10064
10065 itemlen = PyUnicode_GET_LENGTH(item);
10066 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010067 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010068 res_offset += itemlen;
10069 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010070 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010071 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010072 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010075 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010077
Benjamin Peterson29060642009-01-31 22:14:21 +000010078 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010080 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081 return NULL;
10082}
10083
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the character
   buffer `data`, beginning at unit index `start`.  `kind` selects the unit
   width: PyUnicode_1BYTE_KIND uses memset(), PyUnicode_2BYTE_KIND and
   PyUnicode_4BYTE_KIND use a store loop.  Any other kind trips assert(0).

   Every argument is parenthesized at each use so that expression arguments
   (e.g. a conditional) keep their meaning inside the cast and the assert.
   Arguments may still be evaluated more than once, so they must not have
   side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10107
Victor Stinnerd3f08822012-05-29 12:57:52 +020010108void
10109_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10110 Py_UCS4 fill_char)
10111{
10112 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10113 const void *data = PyUnicode_DATA(unicode);
10114 assert(PyUnicode_IS_READY(unicode));
10115 assert(unicode_modifiable(unicode));
10116 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10117 assert(start >= 0);
10118 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10119 FILL(kind, data, fill_char, start, length);
10120}
10121
Victor Stinner3fe55312012-01-04 00:33:50 +010010122Py_ssize_t
10123PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10124 Py_UCS4 fill_char)
10125{
10126 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010127
10128 if (!PyUnicode_Check(unicode)) {
10129 PyErr_BadInternalCall();
10130 return -1;
10131 }
10132 if (PyUnicode_READY(unicode) == -1)
10133 return -1;
10134 if (unicode_check_modifiable(unicode))
10135 return -1;
10136
Victor Stinnerd3f08822012-05-29 12:57:52 +020010137 if (start < 0) {
10138 PyErr_SetString(PyExc_IndexError, "string index out of range");
10139 return -1;
10140 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010141 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10142 PyErr_SetString(PyExc_ValueError,
10143 "fill character is bigger than "
10144 "the string maximum character");
10145 return -1;
10146 }
10147
10148 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10149 length = Py_MIN(maxlen, length);
10150 if (length <= 0)
10151 return 0;
10152
Victor Stinnerd3f08822012-05-29 12:57:52 +020010153 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010154 return length;
10155}
10156
Victor Stinner9310abb2011-10-05 00:59:23 +020010157static PyObject *
10158pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010159 Py_ssize_t left,
10160 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 PyObject *u;
10164 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010165 int kind;
10166 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167
10168 if (left < 0)
10169 left = 0;
10170 if (right < 0)
10171 right = 0;
10172
Victor Stinnerc4b49542011-12-11 22:44:26 +010010173 if (left == 0 && right == 0)
10174 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10177 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010178 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10179 return NULL;
10180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010182 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010184 if (!u)
10185 return NULL;
10186
10187 kind = PyUnicode_KIND(u);
10188 data = PyUnicode_DATA(u);
10189 if (left)
10190 FILL(kind, data, fill, 0, left);
10191 if (right)
10192 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010193 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010194 assert(_PyUnicode_CheckConsistency(u, 1));
10195 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196}
10197
Alexander Belopolsky40018472011-02-26 01:02:56 +000010198PyObject *
10199PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010203 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010204 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205
Benjamin Petersonead6b532011-12-20 17:23:42 -060010206 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208 if (PyUnicode_IS_ASCII(string))
10209 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010210 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010211 PyUnicode_GET_LENGTH(string), keepends);
10212 else
10213 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010214 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010215 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 break;
10217 case PyUnicode_2BYTE_KIND:
10218 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010219 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 PyUnicode_GET_LENGTH(string), keepends);
10221 break;
10222 case PyUnicode_4BYTE_KIND:
10223 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010224 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 PyUnicode_GET_LENGTH(string), keepends);
10226 break;
10227 default:
10228 assert(0);
10229 list = 0;
10230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232}
10233
Alexander Belopolsky40018472011-02-26 01:02:56 +000010234static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010235split(PyObject *self,
10236 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010237 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010239 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 void *buf1, *buf2;
10241 Py_ssize_t len1, len2;
10242 PyObject* out;
10243
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010245 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (PyUnicode_READY(self) == -1)
10248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010251 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010253 if (PyUnicode_IS_ASCII(self))
10254 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010255 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010256 PyUnicode_GET_LENGTH(self), maxcount
10257 );
10258 else
10259 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010260 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010261 PyUnicode_GET_LENGTH(self), maxcount
10262 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 case PyUnicode_2BYTE_KIND:
10264 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010265 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 PyUnicode_GET_LENGTH(self), maxcount
10267 );
10268 case PyUnicode_4BYTE_KIND:
10269 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010270 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 PyUnicode_GET_LENGTH(self), maxcount
10272 );
10273 default:
10274 assert(0);
10275 return NULL;
10276 }
10277
10278 if (PyUnicode_READY(substring) == -1)
10279 return NULL;
10280
10281 kind1 = PyUnicode_KIND(self);
10282 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 len1 = PyUnicode_GET_LENGTH(self);
10284 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010285 if (kind1 < kind2 || len1 < len2) {
10286 out = PyList_New(1);
10287 if (out == NULL)
10288 return NULL;
10289 Py_INCREF(self);
10290 PyList_SET_ITEM(out, 0, self);
10291 return out;
10292 }
10293 buf1 = PyUnicode_DATA(self);
10294 buf2 = PyUnicode_DATA(substring);
10295 if (kind2 != kind1) {
10296 buf2 = _PyUnicode_AsKind(substring, kind1);
10297 if (!buf2)
10298 return NULL;
10299 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010301 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010303 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10304 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010305 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010306 else
10307 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 break;
10310 case PyUnicode_2BYTE_KIND:
10311 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010312 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 break;
10314 case PyUnicode_4BYTE_KIND:
10315 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010316 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 break;
10318 default:
10319 out = NULL;
10320 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010321 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 PyMem_Free(buf2);
10323 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324}
10325
Alexander Belopolsky40018472011-02-26 01:02:56 +000010326static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010327rsplit(PyObject *self,
10328 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010329 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010330{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010331 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 void *buf1, *buf2;
10333 Py_ssize_t len1, len2;
10334 PyObject* out;
10335
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010336 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010337 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (PyUnicode_READY(self) == -1)
10340 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010343 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010345 if (PyUnicode_IS_ASCII(self))
10346 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010347 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010348 PyUnicode_GET_LENGTH(self), maxcount
10349 );
10350 else
10351 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010352 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010353 PyUnicode_GET_LENGTH(self), maxcount
10354 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 case PyUnicode_2BYTE_KIND:
10356 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010357 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 PyUnicode_GET_LENGTH(self), maxcount
10359 );
10360 case PyUnicode_4BYTE_KIND:
10361 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010362 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 PyUnicode_GET_LENGTH(self), maxcount
10364 );
10365 default:
10366 assert(0);
10367 return NULL;
10368 }
10369
10370 if (PyUnicode_READY(substring) == -1)
10371 return NULL;
10372
10373 kind1 = PyUnicode_KIND(self);
10374 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 len1 = PyUnicode_GET_LENGTH(self);
10376 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010377 if (kind1 < kind2 || len1 < len2) {
10378 out = PyList_New(1);
10379 if (out == NULL)
10380 return NULL;
10381 Py_INCREF(self);
10382 PyList_SET_ITEM(out, 0, self);
10383 return out;
10384 }
10385 buf1 = PyUnicode_DATA(self);
10386 buf2 = PyUnicode_DATA(substring);
10387 if (kind2 != kind1) {
10388 buf2 = _PyUnicode_AsKind(substring, kind1);
10389 if (!buf2)
10390 return NULL;
10391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010393 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010395 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10396 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010397 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010398 else
10399 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010400 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 break;
10402 case PyUnicode_2BYTE_KIND:
10403 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 break;
10406 case PyUnicode_4BYTE_KIND:
10407 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010408 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 break;
10410 default:
10411 out = NULL;
10412 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010413 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 PyMem_Free(buf2);
10415 return out;
10416}
10417
10418static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010419anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10420 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010422 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010424 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10425 return asciilib_find(buf1, len1, buf2, len2, offset);
10426 else
10427 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 case PyUnicode_2BYTE_KIND:
10429 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10430 case PyUnicode_4BYTE_KIND:
10431 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10432 }
10433 assert(0);
10434 return -1;
10435}
10436
10437static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010438anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10439 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010441 switch (kind) {
10442 case PyUnicode_1BYTE_KIND:
10443 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10444 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10445 else
10446 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10447 case PyUnicode_2BYTE_KIND:
10448 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10449 case PyUnicode_4BYTE_KIND:
10450 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10451 }
10452 assert(0);
10453 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010454}
10455
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010456static void
10457replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10458 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10459{
10460 int kind = PyUnicode_KIND(u);
10461 void *data = PyUnicode_DATA(u);
10462 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10463 if (kind == PyUnicode_1BYTE_KIND) {
10464 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10465 (Py_UCS1 *)data + len,
10466 u1, u2, maxcount);
10467 }
10468 else if (kind == PyUnicode_2BYTE_KIND) {
10469 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10470 (Py_UCS2 *)data + len,
10471 u1, u2, maxcount);
10472 }
10473 else {
10474 assert(kind == PyUnicode_4BYTE_KIND);
10475 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10476 (Py_UCS4 *)data + len,
10477 u1, u2, maxcount);
10478 }
10479}
10480
/* Build the result of replacing occurrences of str1 in self with str2.

   maxcount limits the number of replacements; a negative value is turned
   into PY_SSIZE_T_MAX (i.e. no practical limit).

   Returns:
     - the value of unicode_result_unchanged(self) when there is nothing to
       replace (maxcount == 0, empty self with a non-negative maxcount,
       str1 and str2 being the same object, str1 wider than self, or no
       match found);
     - a string created here (or unicode_empty) holding the replaced text;
     - NULL on failure.  OverflowError is set explicitly when the result
       length would overflow; other failures come from the helpers called.

   The function reads kind/data/length of all three arguments directly.
   NOTE(review): this presumably requires the caller to have readied
   self, str1 and str2 -- confirm at the call sites.

   Buffer ownership: sbuf/buf1/buf2 initially point into the argument
   objects.  Whenever one of them is replaced by the result of
   _PyUnicode_AsKind(), the matching srelease/release1/release2 flag is set
   so the exit paths free exactly the buffers allocated here. */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    char *sbuf = PyUnicode_DATA(self);
    char *buf1 = PyUnicode_DATA(str1);
    char *buf2 = PyUnicode_DATA(str2);
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    /* Note the else-branch: the "slen == 0" shortcut is only evaluated
       when maxcount is non-negative. */
    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || slen == 0)
        goto nothing;

    /* replacing something with the very same object changes nothing */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    /* the result is allocated wide enough for both self and str2 */
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            /* position of the first u1; a negative result means no match */
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            /* copy self into the result, then patch it starting at pos */
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            /* rkind: kind (bytes per character) all working buffers are
               brought to; starts as self's kind */
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            /* first match; done before any further allocation so the
               no-match case exits early */
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = _PyUnicode_AsKind(str2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                /* drop a buf1 widened above; release1 is cleared first so
                   the error path cannot free it a second time */
                if (release1) PyMem_Free(buf1);
                release1 = 0;
                sbuf = _PyUnicode_AsKind(self, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* start from a full copy of self (rkind bytes per character) */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* one replacement is already done, hence the pre-decrement */
            while ( --maxcount > 0) {
                /* search the remainder; the result is used below as an
                   index into the whole string */
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        /* different lengths: the result has to be assembled piecewise.
           n = number of replacements, i = read index in sbuf,
           j = index of the current match, ires = write index in res. */
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = _PyUnicode_AsKind(str2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = _PyUnicode_AsKind(self, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            /* drop a buf1 widened above before widening it again */
            if (release1) PyMem_Free(buf1);
            release1 = 0;
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1))); */
        /* overflow guard for the growth case, done by division so the
           check itself cannot overflow (n is non-zero here) */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            /* everything was removed: hand out unicode_empty */
            _Py_INCREF_UNICODE_EMPTY();
            if (!unicode_empty)
                goto error;
            u = unicode_empty;
            goto done;
        }
        /* the byte counts computed below are rkind * <length> */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                /* resume reading just past the match */
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave */
            /* empty str1: emit str2, then one character of self, and
               repeat until n copies of str2 have been written */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            /* whatever is left of self follows the last str2 */
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* may replace u; a NULL u afterwards is treated as failure */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    /* success: free only the buffers allocated in this function */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    return unicode_result_unchanged(self);

  error:
    /* a pointer may be NULL here if its _PyUnicode_AsKind() call is what
       failed, hence the extra pointer checks */
    if (srelease && sbuf)
        PyMem_FREE(sbuf);
    if (release1 && buf1)
        PyMem_FREE(buf1);
    if (release2 && buf2)
        PyMem_FREE(buf2);
    return NULL;
}
10742
10743/* --- Unicode Object Methods --------------------------------------------- */
10744
INADA Naoki3ae20562017-01-16 20:41:20 +090010745/*[clinic input]
10746str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
INADA Naoki3ae20562017-01-16 20:41:20 +090010748Return a version of the string where each word is titlecased.
10749
10750More specifically, words start with uppercased characters and all remaining
10751cased characters have lower case.
10752[clinic start generated code]*/
10753
10754static PyObject *
10755unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010756/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010758 if (PyUnicode_READY(self) == -1)
10759 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010760 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761}
10762
INADA Naoki3ae20562017-01-16 20:41:20 +090010763/*[clinic input]
10764str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765
INADA Naoki3ae20562017-01-16 20:41:20 +090010766Return a capitalized version of the string.
10767
10768More specifically, make the first character have upper case and the rest lower
10769case.
10770[clinic start generated code]*/
10771
10772static PyObject *
10773unicode_capitalize_impl(PyObject *self)
10774/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010776 if (PyUnicode_READY(self) == -1)
10777 return NULL;
10778 if (PyUnicode_GET_LENGTH(self) == 0)
10779 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010780 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781}
10782
INADA Naoki3ae20562017-01-16 20:41:20 +090010783/*[clinic input]
10784str.casefold as unicode_casefold
10785
10786Return a version of the string suitable for caseless comparisons.
10787[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010788
10789static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010790unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010791/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010792{
10793 if (PyUnicode_READY(self) == -1)
10794 return NULL;
10795 if (PyUnicode_IS_ASCII(self))
10796 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010797 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010798}
10799
10800
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010801/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010802
10803static int
10804convert_uc(PyObject *obj, void *addr)
10805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010807
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010808 if (!PyUnicode_Check(obj)) {
10809 PyErr_Format(PyExc_TypeError,
10810 "The fill character must be a unicode character, "
10811 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010812 return 0;
10813 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010814 if (PyUnicode_READY(obj) < 0)
10815 return 0;
10816 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010817 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010818 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010819 return 0;
10820 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010821 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010822 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010823}
10824
INADA Naoki3ae20562017-01-16 20:41:20 +090010825/*[clinic input]
10826str.center as unicode_center
10827
10828 width: Py_ssize_t
10829 fillchar: Py_UCS4 = ' '
10830 /
10831
10832Return a centered string of length width.
10833
10834Padding is done using the specified fill character (default is a space).
10835[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
10837static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010838unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10839/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010841 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
Benjamin Petersonbac79492012-01-14 13:34:47 -050010843 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 return NULL;
10845
Victor Stinnerc4b49542011-12-11 22:44:26 +010010846 if (PyUnicode_GET_LENGTH(self) >= width)
10847 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848
Victor Stinnerc4b49542011-12-11 22:44:26 +010010849 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850 left = marg / 2 + (marg & width & 1);
10851
Victor Stinner9310abb2011-10-05 00:59:23 +020010852 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853}
10854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855/* This function assumes that str1 and str2 are readied by the caller. */
10856
Marc-André Lemburge5034372000-08-08 08:04:29 +000010857static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010858unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010859{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010860#define COMPARE(TYPE1, TYPE2) \
10861 do { \
10862 TYPE1* p1 = (TYPE1 *)data1; \
10863 TYPE2* p2 = (TYPE2 *)data2; \
10864 TYPE1* end = p1 + len; \
10865 Py_UCS4 c1, c2; \
10866 for (; p1 != end; p1++, p2++) { \
10867 c1 = *p1; \
10868 c2 = *p2; \
10869 if (c1 != c2) \
10870 return (c1 < c2) ? -1 : 1; \
10871 } \
10872 } \
10873 while (0)
10874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 int kind1, kind2;
10876 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010877 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 kind1 = PyUnicode_KIND(str1);
10880 kind2 = PyUnicode_KIND(str2);
10881 data1 = PyUnicode_DATA(str1);
10882 data2 = PyUnicode_DATA(str2);
10883 len1 = PyUnicode_GET_LENGTH(str1);
10884 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010885 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010886
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 switch(kind1) {
10888 case PyUnicode_1BYTE_KIND:
10889 {
10890 switch(kind2) {
10891 case PyUnicode_1BYTE_KIND:
10892 {
10893 int cmp = memcmp(data1, data2, len);
10894 /* normalize result of memcmp() into the range [-1; 1] */
10895 if (cmp < 0)
10896 return -1;
10897 if (cmp > 0)
10898 return 1;
10899 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010900 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010901 case PyUnicode_2BYTE_KIND:
10902 COMPARE(Py_UCS1, Py_UCS2);
10903 break;
10904 case PyUnicode_4BYTE_KIND:
10905 COMPARE(Py_UCS1, Py_UCS4);
10906 break;
10907 default:
10908 assert(0);
10909 }
10910 break;
10911 }
10912 case PyUnicode_2BYTE_KIND:
10913 {
10914 switch(kind2) {
10915 case PyUnicode_1BYTE_KIND:
10916 COMPARE(Py_UCS2, Py_UCS1);
10917 break;
10918 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010919 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010920 COMPARE(Py_UCS2, Py_UCS2);
10921 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010922 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010923 case PyUnicode_4BYTE_KIND:
10924 COMPARE(Py_UCS2, Py_UCS4);
10925 break;
10926 default:
10927 assert(0);
10928 }
10929 break;
10930 }
10931 case PyUnicode_4BYTE_KIND:
10932 {
10933 switch(kind2) {
10934 case PyUnicode_1BYTE_KIND:
10935 COMPARE(Py_UCS4, Py_UCS1);
10936 break;
10937 case PyUnicode_2BYTE_KIND:
10938 COMPARE(Py_UCS4, Py_UCS2);
10939 break;
10940 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010941 {
10942#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10943 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10944 /* normalize result of wmemcmp() into the range [-1; 1] */
10945 if (cmp < 0)
10946 return -1;
10947 if (cmp > 0)
10948 return 1;
10949#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010950 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010951#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010952 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010953 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010954 default:
10955 assert(0);
10956 }
10957 break;
10958 }
10959 default:
10960 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010961 }
10962
Victor Stinner770e19e2012-10-04 22:59:45 +020010963 if (len1 == len2)
10964 return 0;
10965 if (len1 < len2)
10966 return -1;
10967 else
10968 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010969
10970#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010971}
10972
Benjamin Peterson621b4302016-09-09 13:54:34 -070010973static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010974unicode_compare_eq(PyObject *str1, PyObject *str2)
10975{
10976 int kind;
10977 void *data1, *data2;
10978 Py_ssize_t len;
10979 int cmp;
10980
Victor Stinnere5567ad2012-10-23 02:48:49 +020010981 len = PyUnicode_GET_LENGTH(str1);
10982 if (PyUnicode_GET_LENGTH(str2) != len)
10983 return 0;
10984 kind = PyUnicode_KIND(str1);
10985 if (PyUnicode_KIND(str2) != kind)
10986 return 0;
10987 data1 = PyUnicode_DATA(str1);
10988 data2 = PyUnicode_DATA(str2);
10989
10990 cmp = memcmp(data1, data2, len * kind);
10991 return (cmp == 0);
10992}
10993
10994
Alexander Belopolsky40018472011-02-26 01:02:56 +000010995int
10996PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10999 if (PyUnicode_READY(left) == -1 ||
11000 PyUnicode_READY(right) == -1)
11001 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011002
11003 /* a string is equal to itself */
11004 if (left == right)
11005 return 0;
11006
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011007 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011009 PyErr_Format(PyExc_TypeError,
11010 "Can't compare %.100s and %.100s",
11011 left->ob_type->tp_name,
11012 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 return -1;
11014}
11015
Martin v. Löwis5b222132007-06-10 09:51:05 +000011016int
11017PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 Py_ssize_t i;
11020 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011022 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023
Victor Stinner910337b2011-10-03 03:20:16 +020011024 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011025 if (!PyUnicode_IS_READY(uni)) {
11026 const wchar_t *ws = _PyUnicode_WSTR(uni);
11027 /* Compare Unicode string and source character set string */
11028 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11029 if (chr != ustr[i])
11030 return (chr < ustr[i]) ? -1 : 1;
11031 }
11032 /* This check keeps Python strings that end in '\0' from comparing equal
11033 to C strings identical up to that point. */
11034 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11035 return 1; /* uni is longer */
11036 if (ustr[i])
11037 return -1; /* str is longer */
11038 return 0;
11039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011041 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011042 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011043 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011044 size_t len, len2 = strlen(str);
11045 int cmp;
11046
11047 len = Py_MIN(len1, len2);
11048 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011049 if (cmp != 0) {
11050 if (cmp < 0)
11051 return -1;
11052 else
11053 return 1;
11054 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011055 if (len1 > len2)
11056 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011057 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011058 return -1; /* str is longer */
11059 return 0;
11060 }
11061 else {
11062 void *data = PyUnicode_DATA(uni);
11063 /* Compare Unicode string and source character set string */
11064 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011065 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011066 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11067 /* This check keeps Python strings that end in '\0' from comparing equal
11068 to C strings identical up to that point. */
11069 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11070 return 1; /* uni is longer */
11071 if (str[i])
11072 return -1; /* str is longer */
11073 return 0;
11074 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011075}
11076
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011077static int
11078non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11079{
11080 size_t i, len;
11081 const wchar_t *p;
11082 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11083 if (strlen(str) != len)
11084 return 0;
11085 p = _PyUnicode_WSTR(unicode);
11086 assert(p);
11087 for (i = 0; i < len; i++) {
11088 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011089 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011090 return 0;
11091 }
11092 return 1;
11093}
11094
11095int
11096_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11097{
11098 size_t len;
11099 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011100 assert(str);
11101#ifndef NDEBUG
11102 for (const char *p = str; *p; p++) {
11103 assert((unsigned char)*p < 128);
11104 }
11105#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011106 if (PyUnicode_READY(unicode) == -1) {
11107 /* Memory error or bad data */
11108 PyErr_Clear();
11109 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11110 }
11111 if (!PyUnicode_IS_ASCII(unicode))
11112 return 0;
11113 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11114 return strlen(str) == len &&
11115 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11116}
11117
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011118int
11119_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11120{
11121 PyObject *right_uni;
11122 Py_hash_t hash;
11123
11124 assert(_PyUnicode_CHECK(left));
11125 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011126#ifndef NDEBUG
11127 for (const char *p = right->string; *p; p++) {
11128 assert((unsigned char)*p < 128);
11129 }
11130#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011131
11132 if (PyUnicode_READY(left) == -1) {
11133 /* memory error or bad data */
11134 PyErr_Clear();
11135 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11136 }
11137
11138 if (!PyUnicode_IS_ASCII(left))
11139 return 0;
11140
11141 right_uni = _PyUnicode_FromId(right); /* borrowed */
11142 if (right_uni == NULL) {
11143 /* memory error or bad data */
11144 PyErr_Clear();
11145 return _PyUnicode_EqualToASCIIString(left, right->string);
11146 }
11147
11148 if (left == right_uni)
11149 return 1;
11150
11151 if (PyUnicode_CHECK_INTERNED(left))
11152 return 0;
11153
11154 assert(_PyUnicode_HASH(right_uni) != 1);
11155 hash = _PyUnicode_HASH(left);
11156 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11157 return 0;
11158
11159 return unicode_compare_eq(left, right_uni);
11160}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011161
/* Evaluate to Py_True when cond is non-zero and to Py_False otherwise.
   The macro does not touch reference counts. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011164
Alexander Belopolsky40018472011-02-26 01:02:56 +000011165PyObject *
11166PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011167{
11168 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011169 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011170
Victor Stinnere5567ad2012-10-23 02:48:49 +020011171 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11172 Py_RETURN_NOTIMPLEMENTED;
11173
11174 if (PyUnicode_READY(left) == -1 ||
11175 PyUnicode_READY(right) == -1)
11176 return NULL;
11177
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011178 if (left == right) {
11179 switch (op) {
11180 case Py_EQ:
11181 case Py_LE:
11182 case Py_GE:
11183 /* a string is equal to itself */
11184 v = Py_True;
11185 break;
11186 case Py_NE:
11187 case Py_LT:
11188 case Py_GT:
11189 v = Py_False;
11190 break;
11191 default:
11192 PyErr_BadArgument();
11193 return NULL;
11194 }
11195 }
11196 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011197 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011198 result ^= (op == Py_NE);
11199 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011200 }
11201 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011202 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011203
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011204 /* Convert the return value to a Boolean */
11205 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011206 case Py_LE:
11207 v = TEST_COND(result <= 0);
11208 break;
11209 case Py_GE:
11210 v = TEST_COND(result >= 0);
11211 break;
11212 case Py_LT:
11213 v = TEST_COND(result == -1);
11214 break;
11215 case Py_GT:
11216 v = TEST_COND(result == 1);
11217 break;
11218 default:
11219 PyErr_BadArgument();
11220 return NULL;
11221 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011222 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011223 Py_INCREF(v);
11224 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011225}
11226
Alexander Belopolsky40018472011-02-26 01:02:56 +000011227int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011228_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11229{
11230 return unicode_eq(aa, bb);
11231}
11232
11233int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011234PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011235{
Victor Stinner77282cb2013-04-14 19:22:47 +020011236 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 void *buf1, *buf2;
11238 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011239 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011240
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011241 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011242 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011243 "'in <string>' requires string as left operand, not %.100s",
11244 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011245 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011246 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011247 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011248 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011249 if (ensure_unicode(str) < 0)
11250 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011253 kind2 = PyUnicode_KIND(substr);
11254 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011255 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011257 len2 = PyUnicode_GET_LENGTH(substr);
11258 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011259 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011260 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011261 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011262 if (len2 == 1) {
11263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11264 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011265 return result;
11266 }
11267 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011268 buf2 = _PyUnicode_AsKind(substr, kind1);
11269 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011270 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272
Victor Stinner77282cb2013-04-14 19:22:47 +020011273 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274 case PyUnicode_1BYTE_KIND:
11275 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11276 break;
11277 case PyUnicode_2BYTE_KIND:
11278 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11279 break;
11280 case PyUnicode_4BYTE_KIND:
11281 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11282 break;
11283 default:
11284 result = -1;
11285 assert(0);
11286 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011287
Victor Stinner77282cb2013-04-14 19:22:47 +020011288 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 PyMem_Free(buf2);
11290
Guido van Rossum403d68b2000-03-13 15:55:09 +000011291 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011292}
11293
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294/* Concat to string or Unicode object giving a new Unicode object. */
11295
Alexander Belopolsky40018472011-02-26 01:02:56 +000011296PyObject *
11297PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011300 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011303 if (ensure_unicode(left) < 0)
11304 return NULL;
11305
11306 if (!PyUnicode_Check(right)) {
11307 PyErr_Format(PyExc_TypeError,
11308 "can only concatenate str (not \"%.200s\") to str",
11309 right->ob_type->tp_name);
11310 return NULL;
11311 }
11312 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
11315 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011316 if (left == unicode_empty)
11317 return PyUnicode_FromObject(right);
11318 if (right == unicode_empty)
11319 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011321 left_len = PyUnicode_GET_LENGTH(left);
11322 right_len = PyUnicode_GET_LENGTH(right);
11323 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011324 PyErr_SetString(PyExc_OverflowError,
11325 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011326 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011327 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011328 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011329
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011330 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11331 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011332 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011335 result = PyUnicode_New(new_len, maxchar);
11336 if (result == NULL)
11337 return NULL;
11338 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11339 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11340 assert(_PyUnicode_CheckConsistency(result, 1));
11341 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342}
11343
Walter Dörwald1ab83302007-05-18 17:15:44 +000011344void
Victor Stinner23e56682011-10-03 03:54:37 +020011345PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011346{
Victor Stinner23e56682011-10-03 03:54:37 +020011347 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011348 Py_UCS4 maxchar, maxchar2;
11349 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011350
11351 if (p_left == NULL) {
11352 if (!PyErr_Occurred())
11353 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011354 return;
11355 }
Victor Stinner23e56682011-10-03 03:54:37 +020011356 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011357 if (right == NULL || left == NULL
11358 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011359 if (!PyErr_Occurred())
11360 PyErr_BadInternalCall();
11361 goto error;
11362 }
11363
Benjamin Petersonbac79492012-01-14 13:34:47 -050011364 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011365 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011366 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011367 goto error;
11368
Victor Stinner488fa492011-12-12 00:01:39 +010011369 /* Shortcuts */
11370 if (left == unicode_empty) {
11371 Py_DECREF(left);
11372 Py_INCREF(right);
11373 *p_left = right;
11374 return;
11375 }
11376 if (right == unicode_empty)
11377 return;
11378
11379 left_len = PyUnicode_GET_LENGTH(left);
11380 right_len = PyUnicode_GET_LENGTH(right);
11381 if (left_len > PY_SSIZE_T_MAX - right_len) {
11382 PyErr_SetString(PyExc_OverflowError,
11383 "strings are too large to concat");
11384 goto error;
11385 }
11386 new_len = left_len + right_len;
11387
11388 if (unicode_modifiable(left)
11389 && PyUnicode_CheckExact(right)
11390 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011391 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11392 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011393 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011394 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011395 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11396 {
11397 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011398 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011399 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011400
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011401 /* copy 'right' into the newly allocated area of 'left' */
11402 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011403 }
Victor Stinner488fa492011-12-12 00:01:39 +010011404 else {
11405 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11406 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011407 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011408
Victor Stinner488fa492011-12-12 00:01:39 +010011409 /* Concat the two Unicode strings */
11410 res = PyUnicode_New(new_len, maxchar);
11411 if (res == NULL)
11412 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011413 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11414 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011415 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011416 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011417 }
11418 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011419 return;
11420
11421error:
Victor Stinner488fa492011-12-12 00:01:39 +010011422 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011423}
11424
11425void
11426PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11427{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011428 PyUnicode_Append(pleft, right);
11429 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011430}
11431
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011432/*
11433Wraps stringlib_parse_args_finds() and additionally ensures that the
11434first argument is a unicode object.
11435*/
11436
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011437static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011438parse_args_finds_unicode(const char * function_name, PyObject *args,
11439 PyObject **substring,
11440 Py_ssize_t *start, Py_ssize_t *end)
11441{
11442 if(stringlib_parse_args_finds(function_name, args, substring,
11443 start, end)) {
11444 if (ensure_unicode(*substring) < 0)
11445 return 0;
11446 return 1;
11447 }
11448 return 0;
11449}
11450
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011451PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011454Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011455string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011456interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
11458static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011459unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011461 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011462 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011463 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011465 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 void *buf1, *buf2;
11467 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011469 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 kind1 = PyUnicode_KIND(self);
11473 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011474 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011475 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 len1 = PyUnicode_GET_LENGTH(self);
11478 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011480 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011481 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011482
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011483 buf1 = PyUnicode_DATA(self);
11484 buf2 = PyUnicode_DATA(substring);
11485 if (kind2 != kind1) {
11486 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011487 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011488 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011489 }
11490 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 case PyUnicode_1BYTE_KIND:
11492 iresult = ucs1lib_count(
11493 ((Py_UCS1*)buf1) + start, end - start,
11494 buf2, len2, PY_SSIZE_T_MAX
11495 );
11496 break;
11497 case PyUnicode_2BYTE_KIND:
11498 iresult = ucs2lib_count(
11499 ((Py_UCS2*)buf1) + start, end - start,
11500 buf2, len2, PY_SSIZE_T_MAX
11501 );
11502 break;
11503 case PyUnicode_4BYTE_KIND:
11504 iresult = ucs4lib_count(
11505 ((Py_UCS4*)buf1) + start, end - start,
11506 buf2, len2, PY_SSIZE_T_MAX
11507 );
11508 break;
11509 default:
11510 assert(0); iresult = 0;
11511 }
11512
11513 result = PyLong_FromSsize_t(iresult);
11514
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011515 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518 return result;
11519}
11520
INADA Naoki3ae20562017-01-16 20:41:20 +090011521/*[clinic input]
11522str.encode as unicode_encode
11523
11524 encoding: str(c_default="NULL") = 'utf-8'
11525 The encoding in which to encode the string.
11526 errors: str(c_default="NULL") = 'strict'
11527 The error handling scheme to use for encoding errors.
11528 The default is 'strict' meaning that encoding errors raise a
11529 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11530 'xmlcharrefreplace' as well as any other name registered with
11531 codecs.register_error that can handle UnicodeEncodeErrors.
11532
11533Encode the string using the codec registered for encoding.
11534[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535
11536static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011537unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011538/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011540 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011541}
11542
INADA Naoki3ae20562017-01-16 20:41:20 +090011543/*[clinic input]
11544str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
INADA Naoki3ae20562017-01-16 20:41:20 +090011546 tabsize: int = 8
11547
11548Return a copy where all tab characters are expanded using spaces.
11549
11550If tabsize is not given, a tab size of 8 characters is assumed.
11551[clinic start generated code]*/
11552
11553static PyObject *
11554unicode_expandtabs_impl(PyObject *self, int tabsize)
11555/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011557 Py_ssize_t i, j, line_pos, src_len, incr;
11558 Py_UCS4 ch;
11559 PyObject *u;
11560 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011562 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563
Antoine Pitrou22425222011-10-04 19:10:51 +020011564 if (PyUnicode_READY(self) == -1)
11565 return NULL;
11566
Thomas Wouters7e474022000-07-16 12:04:32 +000011567 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 src_len = PyUnicode_GET_LENGTH(self);
11569 i = j = line_pos = 0;
11570 kind = PyUnicode_KIND(self);
11571 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011572 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011573 for (; i < src_len; i++) {
11574 ch = PyUnicode_READ(kind, src_data, i);
11575 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011576 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011578 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011580 goto overflow;
11581 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011583 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011587 goto overflow;
11588 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011590 if (ch == '\n' || ch == '\r')
11591 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011593 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011594 if (!found)
11595 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011596
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011598 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 if (!u)
11600 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011601 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
Antoine Pitroue71d5742011-10-04 15:55:09 +020011603 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604
Antoine Pitroue71d5742011-10-04 15:55:09 +020011605 for (; i < src_len; i++) {
11606 ch = PyUnicode_READ(kind, src_data, i);
11607 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011608 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011609 incr = tabsize - (line_pos % tabsize);
11610 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011611 FILL(kind, dest_data, ' ', j, incr);
11612 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011614 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011616 line_pos++;
11617 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011618 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011619 if (ch == '\n' || ch == '\r')
11620 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011622 }
11623 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011624 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011625
Antoine Pitroue71d5742011-10-04 15:55:09 +020011626 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011627 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629}
11630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011631PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633\n\
11634Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011635such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636arguments start and end are interpreted as in slice notation.\n\
11637\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011638Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639
11640static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011643 /* initialize variables to prevent gcc warning */
11644 PyObject *substring = NULL;
11645 Py_ssize_t start = 0;
11646 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011647 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011649 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011652 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011655 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 if (result == -2)
11658 return NULL;
11659
Christian Heimes217cfd12007-12-02 14:31:20 +000011660 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661}
11662
11663static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011664unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011666 void *data;
11667 enum PyUnicode_Kind kind;
11668 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011669
11670 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11671 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011673 }
11674 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11675 PyErr_SetString(PyExc_IndexError, "string index out of range");
11676 return NULL;
11677 }
11678 kind = PyUnicode_KIND(self);
11679 data = PyUnicode_DATA(self);
11680 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011681 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682}
11683
Guido van Rossumc2504932007-09-18 19:42:40 +000011684/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011685 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011686static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011687unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688{
Guido van Rossumc2504932007-09-18 19:42:40 +000011689 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011690 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011691
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011692#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011693 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011694#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 if (_PyUnicode_HASH(self) != -1)
11696 return _PyUnicode_HASH(self);
11697 if (PyUnicode_READY(self) == -1)
11698 return -1;
11699 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011700 /*
11701 We make the hash of the empty string be 0, rather than using
11702 (prefix ^ suffix), since this slightly obfuscates the hash secret
11703 */
11704 if (len == 0) {
11705 _PyUnicode_HASH(self) = 0;
11706 return 0;
11707 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011708 x = _Py_HashBytes(PyUnicode_DATA(self),
11709 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011711 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712}
11713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011714PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011717Return the lowest index in S where substring sub is found, \n\
11718such that sub is contained within S[start:end]. Optional\n\
11719arguments start and end are interpreted as in slice notation.\n\
11720\n\
11721Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722
11723static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011726 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011727 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011728 PyObject *substring = NULL;
11729 Py_ssize_t start = 0;
11730 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011732 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011735 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011738 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 if (result == -2)
11741 return NULL;
11742
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 if (result < 0) {
11744 PyErr_SetString(PyExc_ValueError, "substring not found");
11745 return NULL;
11746 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011747
Christian Heimes217cfd12007-12-02 14:31:20 +000011748 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749}
11750
INADA Naoki3ae20562017-01-16 20:41:20 +090011751/*[clinic input]
11752str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
INADA Naoki3ae20562017-01-16 20:41:20 +090011754Return True if the string is a lowercase string, False otherwise.
11755
11756A string is lowercase if all cased characters in the string are lowercase and
11757there is at least one cased character in the string.
11758[clinic start generated code]*/
11759
11760static PyObject *
11761unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011762/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 Py_ssize_t i, length;
11765 int kind;
11766 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767 int cased;
11768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (PyUnicode_READY(self) == -1)
11770 return NULL;
11771 length = PyUnicode_GET_LENGTH(self);
11772 kind = PyUnicode_KIND(self);
11773 data = PyUnicode_DATA(self);
11774
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (length == 1)
11777 return PyBool_FromLong(
11778 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011780 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011782 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011783
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 for (i = 0; i < length; i++) {
11786 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011787
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011789 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 else if (!cased && Py_UNICODE_ISLOWER(ch))
11791 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011793 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794}
11795
INADA Naoki3ae20562017-01-16 20:41:20 +090011796/*[clinic input]
11797str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
INADA Naoki3ae20562017-01-16 20:41:20 +090011799Return True if the string is an uppercase string, False otherwise.
11800
11801A string is uppercase if all cased characters in the string are uppercase and
11802there is at least one cased character in the string.
11803[clinic start generated code]*/
11804
11805static PyObject *
11806unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011807/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 Py_ssize_t i, length;
11810 int kind;
11811 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 int cased;
11813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 if (PyUnicode_READY(self) == -1)
11815 return NULL;
11816 length = PyUnicode_GET_LENGTH(self);
11817 kind = PyUnicode_KIND(self);
11818 data = PyUnicode_DATA(self);
11819
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (length == 1)
11822 return PyBool_FromLong(
11823 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011825 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011827 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011828
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 for (i = 0; i < length; i++) {
11831 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011832
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011834 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 else if (!cased && Py_UNICODE_ISUPPER(ch))
11836 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011838 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839}
11840
INADA Naoki3ae20562017-01-16 20:41:20 +090011841/*[clinic input]
11842str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
INADA Naoki3ae20562017-01-16 20:41:20 +090011844Return True if the string is a title-cased string, False otherwise.
11845
11846In a title-cased string, upper- and title-case characters may only
11847follow uncased characters and lowercase characters only cased ones.
11848[clinic start generated code]*/
11849
11850static PyObject *
11851unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011852/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 Py_ssize_t i, length;
11855 int kind;
11856 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857 int cased, previous_is_cased;
11858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (PyUnicode_READY(self) == -1)
11860 return NULL;
11861 length = PyUnicode_GET_LENGTH(self);
11862 kind = PyUnicode_KIND(self);
11863 data = PyUnicode_DATA(self);
11864
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 if (length == 1) {
11867 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11868 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11869 (Py_UNICODE_ISUPPER(ch) != 0));
11870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011872 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011874 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011875
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 cased = 0;
11877 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 for (i = 0; i < length; i++) {
11879 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011880
Benjamin Peterson29060642009-01-31 22:14:21 +000011881 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11882 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011883 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 previous_is_cased = 1;
11885 cased = 1;
11886 }
11887 else if (Py_UNICODE_ISLOWER(ch)) {
11888 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011889 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 previous_is_cased = 1;
11891 cased = 1;
11892 }
11893 else
11894 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011896 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897}
11898
INADA Naoki3ae20562017-01-16 20:41:20 +090011899/*[clinic input]
11900str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901
INADA Naoki3ae20562017-01-16 20:41:20 +090011902Return True if the string is a whitespace string, False otherwise.
11903
11904A string is whitespace if all characters in the string are whitespace and there
11905is at least one character in the string.
11906[clinic start generated code]*/
11907
11908static PyObject *
11909unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011910/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 Py_ssize_t i, length;
11913 int kind;
11914 void *data;
11915
11916 if (PyUnicode_READY(self) == -1)
11917 return NULL;
11918 length = PyUnicode_GET_LENGTH(self);
11919 kind = PyUnicode_KIND(self);
11920 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 if (length == 1)
11924 return PyBool_FromLong(
11925 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011927 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011929 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 for (i = 0; i < length; i++) {
11932 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011933 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011934 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011936 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937}
11938
INADA Naoki3ae20562017-01-16 20:41:20 +090011939/*[clinic input]
11940str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941
INADA Naoki3ae20562017-01-16 20:41:20 +090011942Return True if the string is an alphabetic string, False otherwise.
11943
11944A string is alphabetic if all characters in the string are alphabetic and there
11945is at least one character in the string.
11946[clinic start generated code]*/
11947
11948static PyObject *
11949unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011950/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 Py_ssize_t i, length;
11953 int kind;
11954 void *data;
11955
11956 if (PyUnicode_READY(self) == -1)
11957 return NULL;
11958 length = PyUnicode_GET_LENGTH(self);
11959 kind = PyUnicode_KIND(self);
11960 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011961
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011962 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (length == 1)
11964 return PyBool_FromLong(
11965 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011966
11967 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011969 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 for (i = 0; i < length; i++) {
11972 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011973 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011974 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011975 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011976}
11977
INADA Naoki3ae20562017-01-16 20:41:20 +090011978/*[clinic input]
11979str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011980
INADA Naoki3ae20562017-01-16 20:41:20 +090011981Return True if the string is an alpha-numeric string, False otherwise.
11982
11983A string is alpha-numeric if all characters in the string are alpha-numeric and
11984there is at least one character in the string.
11985[clinic start generated code]*/
11986
11987static PyObject *
11988unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011989/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 int kind;
11992 void *data;
11993 Py_ssize_t len, i;
11994
11995 if (PyUnicode_READY(self) == -1)
11996 return NULL;
11997
11998 kind = PyUnicode_KIND(self);
11999 data = PyUnicode_DATA(self);
12000 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012001
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (len == 1) {
12004 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12005 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12006 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012007
12008 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012010 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 for (i = 0; i < len; i++) {
12013 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012014 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012015 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012016 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012017 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012018}
12019
INADA Naoki3ae20562017-01-16 20:41:20 +090012020/*[clinic input]
12021str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
INADA Naoki3ae20562017-01-16 20:41:20 +090012023Return True if the string is a decimal string, False otherwise.
12024
12025A string is a decimal string if all characters in the string are decimal and
12026there is at least one character in the string.
12027[clinic start generated code]*/
12028
12029static PyObject *
12030unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012031/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 Py_ssize_t i, length;
12034 int kind;
12035 void *data;
12036
12037 if (PyUnicode_READY(self) == -1)
12038 return NULL;
12039 length = PyUnicode_GET_LENGTH(self);
12040 kind = PyUnicode_KIND(self);
12041 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 if (length == 1)
12045 return PyBool_FromLong(
12046 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012048 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012050 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 for (i = 0; i < length; i++) {
12053 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012054 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012056 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057}
12058
INADA Naoki3ae20562017-01-16 20:41:20 +090012059/*[clinic input]
12060str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061
INADA Naoki3ae20562017-01-16 20:41:20 +090012062Return True if the string is a digit string, False otherwise.
12063
12064A string is a digit string if all characters in the string are digits and there
12065is at least one character in the string.
12066[clinic start generated code]*/
12067
12068static PyObject *
12069unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012070/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 Py_ssize_t i, length;
12073 int kind;
12074 void *data;
12075
12076 if (PyUnicode_READY(self) == -1)
12077 return NULL;
12078 length = PyUnicode_GET_LENGTH(self);
12079 kind = PyUnicode_KIND(self);
12080 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 if (length == 1) {
12084 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12085 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012088 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012090 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 for (i = 0; i < length; i++) {
12093 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012094 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012096 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097}
12098
INADA Naoki3ae20562017-01-16 20:41:20 +090012099/*[clinic input]
12100str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101
INADA Naoki3ae20562017-01-16 20:41:20 +090012102Return True if the string is a numeric string, False otherwise.
12103
12104A string is numeric if all characters in the string are numeric and there is at
12105least one character in the string.
12106[clinic start generated code]*/
12107
12108static PyObject *
12109unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012110/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 Py_ssize_t i, length;
12113 int kind;
12114 void *data;
12115
12116 if (PyUnicode_READY(self) == -1)
12117 return NULL;
12118 length = PyUnicode_GET_LENGTH(self);
12119 kind = PyUnicode_KIND(self);
12120 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 if (length == 1)
12124 return PyBool_FromLong(
12125 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012127 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012129 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 for (i = 0; i < length; i++) {
12132 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012133 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012135 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136}
12137
Martin v. Löwis47383402007-08-15 07:32:56 +000012138int
12139PyUnicode_IsIdentifier(PyObject *self)
12140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 int kind;
12142 void *data;
12143 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012144 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 if (PyUnicode_READY(self) == -1) {
12147 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012148 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 }
12150
12151 /* Special case for empty strings */
12152 if (PyUnicode_GET_LENGTH(self) == 0)
12153 return 0;
12154 kind = PyUnicode_KIND(self);
12155 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012156
12157 /* PEP 3131 says that the first character must be in
12158 XID_Start and subsequent characters in XID_Continue,
12159 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012160 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012161 letters, digits, underscore). However, given the current
12162 definition of XID_Start and XID_Continue, it is sufficient
12163 to check just for these, except that _ must be allowed
12164 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012166 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012167 return 0;
12168
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012169 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012171 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012172 return 1;
12173}
12174
INADA Naoki3ae20562017-01-16 20:41:20 +090012175/*[clinic input]
12176str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012177
INADA Naoki3ae20562017-01-16 20:41:20 +090012178Return True if the string is a valid Python identifier, False otherwise.
12179
12180Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12181"class".
12182[clinic start generated code]*/
12183
12184static PyObject *
12185unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012186/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012187{
12188 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12189}
12190
INADA Naoki3ae20562017-01-16 20:41:20 +090012191/*[clinic input]
12192str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012193
INADA Naoki3ae20562017-01-16 20:41:20 +090012194Return True if the string is printable, False otherwise.
12195
12196A string is printable if all of its characters are considered printable in
12197repr() or if it is empty.
12198[clinic start generated code]*/
12199
12200static PyObject *
12201unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012202/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 Py_ssize_t i, length;
12205 int kind;
12206 void *data;
12207
12208 if (PyUnicode_READY(self) == -1)
12209 return NULL;
12210 length = PyUnicode_GET_LENGTH(self);
12211 kind = PyUnicode_KIND(self);
12212 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012213
12214 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 if (length == 1)
12216 return PyBool_FromLong(
12217 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 for (i = 0; i < length; i++) {
12220 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012221 Py_RETURN_FALSE;
12222 }
12223 }
12224 Py_RETURN_TRUE;
12225}
12226
INADA Naoki3ae20562017-01-16 20:41:20 +090012227/*[clinic input]
12228str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
INADA Naoki3ae20562017-01-16 20:41:20 +090012230 iterable: object
12231 /
12232
12233Concatenate any number of strings.
12234
Martin Panter91a88662017-01-24 00:30:06 +000012235The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012236The result is returned as a new string.
12237
12238Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12239[clinic start generated code]*/
12240
12241static PyObject *
12242unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012243/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
INADA Naoki3ae20562017-01-16 20:41:20 +090012245 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246}
12247
Martin v. Löwis18e16552006-02-15 17:27:45 +000012248static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012249unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 if (PyUnicode_READY(self) == -1)
12252 return -1;
12253 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254}
12255
INADA Naoki3ae20562017-01-16 20:41:20 +090012256/*[clinic input]
12257str.ljust as unicode_ljust
12258
12259 width: Py_ssize_t
12260 fillchar: Py_UCS4 = ' '
12261 /
12262
12263Return a left-justified string of length width.
12264
12265Padding is done using the specified fill character (default is a space).
12266[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
12268static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012269unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12270/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012272 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274
Victor Stinnerc4b49542011-12-11 22:44:26 +010012275 if (PyUnicode_GET_LENGTH(self) >= width)
12276 return unicode_result_unchanged(self);
12277
12278 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279}
12280
INADA Naoki3ae20562017-01-16 20:41:20 +090012281/*[clinic input]
12282str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
INADA Naoki3ae20562017-01-16 20:41:20 +090012284Return a copy of the string converted to lowercase.
12285[clinic start generated code]*/
12286
12287static PyObject *
12288unicode_lower_impl(PyObject *self)
12289/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012291 if (PyUnicode_READY(self) == -1)
12292 return NULL;
12293 if (PyUnicode_IS_ASCII(self))
12294 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012295 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296}
12297
/* Strip modes.  Each value doubles as the index of the matching method
   name in stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  Both the strings and the pointer table are
   const: this is a read-only lookup table. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip mode `i`, which must be one of the modes above. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012306
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307/* externally visible for str.strip(unicode) */
12308PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012309_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 void *data;
12312 int kind;
12313 Py_ssize_t i, j, len;
12314 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012315 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12318 return NULL;
12319
12320 kind = PyUnicode_KIND(self);
12321 data = PyUnicode_DATA(self);
12322 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012323 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12325 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012326 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012327
Benjamin Peterson14339b62009-01-31 16:36:08 +000012328 i = 0;
12329 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012330 while (i < len) {
12331 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12332 if (!BLOOM(sepmask, ch))
12333 break;
12334 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12335 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 i++;
12337 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012338 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012339
Benjamin Peterson14339b62009-01-31 16:36:08 +000012340 j = len;
12341 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012342 j--;
12343 while (j >= i) {
12344 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12345 if (!BLOOM(sepmask, ch))
12346 break;
12347 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12348 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012350 }
12351
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012353 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012354
Victor Stinner7931d9a2011-11-04 00:22:48 +010012355 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356}
12357
12358PyObject*
12359PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12360{
12361 unsigned char *data;
12362 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012363 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364
Victor Stinnerde636f32011-10-01 03:55:54 +020012365 if (PyUnicode_READY(self) == -1)
12366 return NULL;
12367
Victor Stinner684d5fd2012-05-03 02:32:34 +020012368 length = PyUnicode_GET_LENGTH(self);
12369 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012370
Victor Stinner684d5fd2012-05-03 02:32:34 +020012371 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012372 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373
Victor Stinnerde636f32011-10-01 03:55:54 +020012374 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012375 PyErr_SetString(PyExc_IndexError, "string index out of range");
12376 return NULL;
12377 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012378 if (start >= length || end < start)
12379 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012380
Victor Stinner684d5fd2012-05-03 02:32:34 +020012381 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012382 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012383 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012384 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012385 }
12386 else {
12387 kind = PyUnicode_KIND(self);
12388 data = PyUnicode_1BYTE_DATA(self);
12389 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012390 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012391 length);
12392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394
12395static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012396do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 Py_ssize_t len, i, j;
12399
12400 if (PyUnicode_READY(self) == -1)
12401 return NULL;
12402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012404
Victor Stinnercc7af722013-04-09 22:39:24 +020012405 if (PyUnicode_IS_ASCII(self)) {
12406 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12407
12408 i = 0;
12409 if (striptype != RIGHTSTRIP) {
12410 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012411 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012412 if (!_Py_ascii_whitespace[ch])
12413 break;
12414 i++;
12415 }
12416 }
12417
12418 j = len;
12419 if (striptype != LEFTSTRIP) {
12420 j--;
12421 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012422 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012423 if (!_Py_ascii_whitespace[ch])
12424 break;
12425 j--;
12426 }
12427 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012428 }
12429 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012430 else {
12431 int kind = PyUnicode_KIND(self);
12432 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012433
Victor Stinnercc7af722013-04-09 22:39:24 +020012434 i = 0;
12435 if (striptype != RIGHTSTRIP) {
12436 while (i < len) {
12437 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12438 if (!Py_UNICODE_ISSPACE(ch))
12439 break;
12440 i++;
12441 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012442 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012443
12444 j = len;
12445 if (striptype != LEFTSTRIP) {
12446 j--;
12447 while (j >= i) {
12448 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12449 if (!Py_UNICODE_ISSPACE(ch))
12450 break;
12451 j--;
12452 }
12453 j++;
12454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012456
Victor Stinner7931d9a2011-11-04 00:22:48 +010012457 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458}
12459
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460
12461static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012462do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012463{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 if (sep != NULL && sep != Py_None) {
12465 if (PyUnicode_Check(sep))
12466 return _PyUnicode_XStrip(self, striptype, sep);
12467 else {
12468 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012469 "%s arg must be None or str",
12470 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471 return NULL;
12472 }
12473 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012474
Benjamin Peterson14339b62009-01-31 16:36:08 +000012475 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012476}
12477
12478
INADA Naoki3ae20562017-01-16 20:41:20 +090012479/*[clinic input]
12480str.strip as unicode_strip
12481
12482 chars: object = None
12483 /
12484
Victor Stinner0c4a8282017-01-17 02:21:47 +010012485Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012486
12487If chars is given and not None, remove characters in chars instead.
12488[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012489
12490static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012491unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012492/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012493{
INADA Naoki3ae20562017-01-16 20:41:20 +090012494 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012495}
12496
12497
INADA Naoki3ae20562017-01-16 20:41:20 +090012498/*[clinic input]
12499str.lstrip as unicode_lstrip
12500
12501 chars: object = NULL
12502 /
12503
12504Return a copy of the string with leading whitespace removed.
12505
12506If chars is given and not None, remove characters in chars instead.
12507[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012508
12509static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012510unicode_lstrip_impl(PyObject *self, PyObject *chars)
12511/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012512{
INADA Naoki3ae20562017-01-16 20:41:20 +090012513 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012514}
12515
12516
INADA Naoki3ae20562017-01-16 20:41:20 +090012517/*[clinic input]
12518str.rstrip as unicode_rstrip
12519
12520 chars: object = NULL
12521 /
12522
12523Return a copy of the string with trailing whitespace removed.
12524
12525If chars is given and not None, remove characters in chars instead.
12526[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012527
12528static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012529unicode_rstrip_impl(PyObject *self, PyObject *chars)
12530/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012531{
INADA Naoki3ae20562017-01-16 20:41:20 +090012532 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012533}
12534
12535
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012537unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012539 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541
Serhiy Storchaka05997252013-01-26 12:14:02 +020012542 if (len < 1)
12543 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
Victor Stinnerc4b49542011-12-11 22:44:26 +010012545 /* no repeat, return original string */
12546 if (len == 1)
12547 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012548
Benjamin Petersonbac79492012-01-14 13:34:47 -050012549 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 return NULL;
12551
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012552 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012553 PyErr_SetString(PyExc_OverflowError,
12554 "repeated string is too long");
12555 return NULL;
12556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012558
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012559 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560 if (!u)
12561 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012562 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 if (PyUnicode_GET_LENGTH(str) == 1) {
12565 const int kind = PyUnicode_KIND(str);
12566 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012567 if (kind == PyUnicode_1BYTE_KIND) {
12568 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012569 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012570 }
12571 else if (kind == PyUnicode_2BYTE_KIND) {
12572 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012573 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012574 ucs2[n] = fill_char;
12575 } else {
12576 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12577 assert(kind == PyUnicode_4BYTE_KIND);
12578 for (n = 0; n < len; ++n)
12579 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 }
12582 else {
12583 /* number of characters copied this far */
12584 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012585 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012587 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012589 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012591 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012592 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594 }
12595
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012596 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012597 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598}
12599
Alexander Belopolsky40018472011-02-26 01:02:56 +000012600PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012601PyUnicode_Replace(PyObject *str,
12602 PyObject *substr,
12603 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012604 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012606 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12607 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012609 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610}
12611
INADA Naoki3ae20562017-01-16 20:41:20 +090012612/*[clinic input]
12613str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614
INADA Naoki3ae20562017-01-16 20:41:20 +090012615 old: unicode
12616 new: unicode
12617 count: Py_ssize_t = -1
12618 Maximum number of occurrences to replace.
12619 -1 (the default value) means replace all occurrences.
12620 /
12621
12622Return a copy with all occurrences of substring old replaced by new.
12623
12624If the optional argument count is given, only the first count occurrences are
12625replaced.
12626[clinic start generated code]*/
12627
12628static PyObject *
12629unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12630 Py_ssize_t count)
12631/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012633 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012634 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012635 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636}
12637
Alexander Belopolsky40018472011-02-26 01:02:56 +000012638static PyObject *
12639unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012641 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 Py_ssize_t isize;
12643 Py_ssize_t osize, squote, dquote, i, o;
12644 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012645 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012649 return NULL;
12650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 isize = PyUnicode_GET_LENGTH(unicode);
12652 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 /* Compute length of output, quote characters, and
12655 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012656 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 max = 127;
12658 squote = dquote = 0;
12659 ikind = PyUnicode_KIND(unicode);
12660 for (i = 0; i < isize; i++) {
12661 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012662 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012664 case '\'': squote++; break;
12665 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012667 incr = 2;
12668 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 default:
12670 /* Fast-path ASCII */
12671 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012672 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012674 ;
12675 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012678 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012680 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012682 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012684 if (osize > PY_SSIZE_T_MAX - incr) {
12685 PyErr_SetString(PyExc_OverflowError,
12686 "string is too long to generate repr");
12687 return NULL;
12688 }
12689 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 }
12691
12692 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012693 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012695 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 if (dquote)
12697 /* Both squote and dquote present. Use squote,
12698 and escape them */
12699 osize += squote;
12700 else
12701 quote = '"';
12702 }
Victor Stinner55c08782013-04-14 18:45:39 +020012703 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704
12705 repr = PyUnicode_New(osize, max);
12706 if (repr == NULL)
12707 return NULL;
12708 okind = PyUnicode_KIND(repr);
12709 odata = PyUnicode_DATA(repr);
12710
12711 PyUnicode_WRITE(okind, odata, 0, quote);
12712 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012713 if (unchanged) {
12714 _PyUnicode_FastCopyCharacters(repr, 1,
12715 unicode, 0,
12716 isize);
12717 }
12718 else {
12719 for (i = 0, o = 1; i < isize; i++) {
12720 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721
Victor Stinner55c08782013-04-14 18:45:39 +020012722 /* Escape quotes and backslashes */
12723 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012724 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012726 continue;
12727 }
12728
12729 /* Map special whitespace to '\t', \n', '\r' */
12730 if (ch == '\t') {
12731 PyUnicode_WRITE(okind, odata, o++, '\\');
12732 PyUnicode_WRITE(okind, odata, o++, 't');
12733 }
12734 else if (ch == '\n') {
12735 PyUnicode_WRITE(okind, odata, o++, '\\');
12736 PyUnicode_WRITE(okind, odata, o++, 'n');
12737 }
12738 else if (ch == '\r') {
12739 PyUnicode_WRITE(okind, odata, o++, '\\');
12740 PyUnicode_WRITE(okind, odata, o++, 'r');
12741 }
12742
12743 /* Map non-printable US ASCII to '\xhh' */
12744 else if (ch < ' ' || ch == 0x7F) {
12745 PyUnicode_WRITE(okind, odata, o++, '\\');
12746 PyUnicode_WRITE(okind, odata, o++, 'x');
12747 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12748 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12749 }
12750
12751 /* Copy ASCII characters as-is */
12752 else if (ch < 0x7F) {
12753 PyUnicode_WRITE(okind, odata, o++, ch);
12754 }
12755
12756 /* Non-ASCII characters */
12757 else {
12758 /* Map Unicode whitespace and control characters
12759 (categories Z* and C* except ASCII space)
12760 */
12761 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12762 PyUnicode_WRITE(okind, odata, o++, '\\');
12763 /* Map 8-bit characters to '\xhh' */
12764 if (ch <= 0xff) {
12765 PyUnicode_WRITE(okind, odata, o++, 'x');
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12768 }
12769 /* Map 16-bit characters to '\uxxxx' */
12770 else if (ch <= 0xffff) {
12771 PyUnicode_WRITE(okind, odata, o++, 'u');
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12776 }
12777 /* Map 21-bit characters to '\U00xxxxxx' */
12778 else {
12779 PyUnicode_WRITE(okind, odata, o++, 'U');
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12782 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12783 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12784 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12785 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12786 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12787 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12788 }
12789 }
12790 /* Copy characters as-is */
12791 else {
12792 PyUnicode_WRITE(okind, odata, o++, ch);
12793 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012794 }
12795 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012798 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012799 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800}
12801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012802PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804\n\
12805Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012806such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807arguments start and end are interpreted as in slice notation.\n\
12808\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012809Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810
12811static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012814 /* initialize variables to prevent gcc warning */
12815 PyObject *substring = NULL;
12816 Py_ssize_t start = 0;
12817 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012820 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012823 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012826 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 if (result == -2)
12829 return NULL;
12830
Christian Heimes217cfd12007-12-02 14:31:20 +000012831 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832}
12833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012834PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012835 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012837Return the highest index in S where substring sub is found,\n\
12838such that sub is contained within S[start:end]. Optional\n\
12839arguments start and end are interpreted as in slice notation.\n\
12840\n\
12841Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842
12843static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012846 /* initialize variables to prevent gcc warning */
12847 PyObject *substring = NULL;
12848 Py_ssize_t start = 0;
12849 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012850 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012852 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012855 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012858 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860 if (result == -2)
12861 return NULL;
12862
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863 if (result < 0) {
12864 PyErr_SetString(PyExc_ValueError, "substring not found");
12865 return NULL;
12866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867
Christian Heimes217cfd12007-12-02 14:31:20 +000012868 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869}
12870
INADA Naoki3ae20562017-01-16 20:41:20 +090012871/*[clinic input]
12872str.rjust as unicode_rjust
12873
12874 width: Py_ssize_t
12875 fillchar: Py_UCS4 = ' '
12876 /
12877
12878Return a right-justified string of length width.
12879
12880Padding is done using the specified fill character (default is a space).
12881[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882
12883static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012884unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12885/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012887 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888 return NULL;
12889
Victor Stinnerc4b49542011-12-11 22:44:26 +010012890 if (PyUnicode_GET_LENGTH(self) >= width)
12891 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892
Victor Stinnerc4b49542011-12-11 22:44:26 +010012893 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894}
12895
Alexander Belopolsky40018472011-02-26 01:02:56 +000012896PyObject *
12897PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012899 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012902 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903}
12904
INADA Naoki3ae20562017-01-16 20:41:20 +090012905/*[clinic input]
12906str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907
INADA Naoki3ae20562017-01-16 20:41:20 +090012908 sep: object = None
12909 The delimiter according to which to split the string.
12910 None (the default value) means split according to any whitespace,
12911 and discard empty strings from the result.
12912 maxsplit: Py_ssize_t = -1
12913 Maximum number of splits to do.
12914 -1 (the default value) means no limit.
12915
12916Return a list of the words in the string, using sep as the delimiter string.
12917[clinic start generated code]*/
12918
12919static PyObject *
12920unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12921/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922{
INADA Naoki3ae20562017-01-16 20:41:20 +090012923 if (sep == Py_None)
12924 return split(self, NULL, maxsplit);
12925 if (PyUnicode_Check(sep))
12926 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012927
12928 PyErr_Format(PyExc_TypeError,
12929 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012930 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012931 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932}
12933
Thomas Wouters477c8d52006-05-27 19:21:47 +000012934PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012935PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012936{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012937 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012938 int kind1, kind2;
12939 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012941
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012942 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012943 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012944
Victor Stinner14f8f022011-10-05 20:58:25 +020012945 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 len1 = PyUnicode_GET_LENGTH(str_obj);
12948 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012949 if (kind1 < kind2 || len1 < len2) {
12950 _Py_INCREF_UNICODE_EMPTY();
12951 if (!unicode_empty)
12952 out = NULL;
12953 else {
12954 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12955 Py_DECREF(unicode_empty);
12956 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012957 return out;
12958 }
12959 buf1 = PyUnicode_DATA(str_obj);
12960 buf2 = PyUnicode_DATA(sep_obj);
12961 if (kind2 != kind1) {
12962 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12963 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012964 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012967 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012969 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12970 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12971 else
12972 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 break;
12974 case PyUnicode_2BYTE_KIND:
12975 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12976 break;
12977 case PyUnicode_4BYTE_KIND:
12978 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12979 break;
12980 default:
12981 assert(0);
12982 out = 0;
12983 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012984
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012985 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987
12988 return out;
12989}
12990
12991
12992PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012993PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012996 int kind1, kind2;
12997 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012999
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013000 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013002
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013003 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 len1 = PyUnicode_GET_LENGTH(str_obj);
13006 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013007 if (kind1 < kind2 || len1 < len2) {
13008 _Py_INCREF_UNICODE_EMPTY();
13009 if (!unicode_empty)
13010 out = NULL;
13011 else {
13012 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13013 Py_DECREF(unicode_empty);
13014 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013015 return out;
13016 }
13017 buf1 = PyUnicode_DATA(str_obj);
13018 buf2 = PyUnicode_DATA(sep_obj);
13019 if (kind2 != kind1) {
13020 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13021 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013022 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013025 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013027 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13028 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13029 else
13030 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013031 break;
13032 case PyUnicode_2BYTE_KIND:
13033 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13034 break;
13035 case PyUnicode_4BYTE_KIND:
13036 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13037 break;
13038 default:
13039 assert(0);
13040 out = 0;
13041 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013042
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013043 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013045
13046 return out;
13047}
13048
INADA Naoki3ae20562017-01-16 20:41:20 +090013049/*[clinic input]
13050str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013051
INADA Naoki3ae20562017-01-16 20:41:20 +090013052 sep: object
13053 /
13054
13055Partition the string into three parts using the given separator.
13056
13057This will search for the separator in the string. If the separator is found,
13058returns a 3-tuple containing the part before the separator, the separator
13059itself, and the part after it.
13060
13061If the separator is not found, returns a 3-tuple containing the original string
13062and two empty strings.
13063[clinic start generated code]*/
13064
13065static PyObject *
13066unicode_partition(PyObject *self, PyObject *sep)
13067/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013068{
INADA Naoki3ae20562017-01-16 20:41:20 +090013069 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013070}
13071
INADA Naoki3ae20562017-01-16 20:41:20 +090013072/*[clinic input]
13073str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013074
INADA Naoki3ae20562017-01-16 20:41:20 +090013075Partition the string into three parts using the given separator.
13076
13077This will search for the separator in the string, starting at the end. If
13078the separator is found, returns a 3-tuple containing the part before the
13079separator, the separator itself, and the part after it.
13080
13081If the separator is not found, returns a 3-tuple containing two empty strings
13082and the original string.
13083[clinic start generated code]*/
13084
13085static PyObject *
13086unicode_rpartition(PyObject *self, PyObject *sep)
13087/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013088{
INADA Naoki3ae20562017-01-16 20:41:20 +090013089 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013090}
13091
Alexander Belopolsky40018472011-02-26 01:02:56 +000013092PyObject *
13093PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013094{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013095 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013096 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013097
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013098 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013099}
13100
INADA Naoki3ae20562017-01-16 20:41:20 +090013101/*[clinic input]
13102str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013103
INADA Naoki3ae20562017-01-16 20:41:20 +090013104Return a list of the words in the string, using sep as the delimiter string.
13105
13106Splits are done starting at the end of the string and working to the front.
13107[clinic start generated code]*/
13108
13109static PyObject *
13110unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13111/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013112{
INADA Naoki3ae20562017-01-16 20:41:20 +090013113 if (sep == Py_None)
13114 return rsplit(self, NULL, maxsplit);
13115 if (PyUnicode_Check(sep))
13116 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013117
13118 PyErr_Format(PyExc_TypeError,
13119 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013120 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013121 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013122}
13123
INADA Naoki3ae20562017-01-16 20:41:20 +090013124/*[clinic input]
13125str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013127 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013128
13129Return a list of the lines in the string, breaking at line boundaries.
13130
13131Line breaks are not included in the resulting list unless keepends is given and
13132true.
13133[clinic start generated code]*/
13134
13135static PyObject *
13136unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013137/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013139 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140}
13141
13142static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013143PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013145 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146}
13147
INADA Naoki3ae20562017-01-16 20:41:20 +090013148/*[clinic input]
13149str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150
INADA Naoki3ae20562017-01-16 20:41:20 +090013151Convert uppercase characters to lowercase and lowercase characters to uppercase.
13152[clinic start generated code]*/
13153
13154static PyObject *
13155unicode_swapcase_impl(PyObject *self)
13156/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013158 if (PyUnicode_READY(self) == -1)
13159 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013160 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161}
13162
Larry Hastings61272b72014-01-07 12:41:53 -080013163/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013164
Larry Hastings31826802013-10-19 00:09:25 -070013165@staticmethod
13166str.maketrans as unicode_maketrans
13167
13168 x: object
13169
13170 y: unicode=NULL
13171
13172 z: unicode=NULL
13173
13174 /
13175
13176Return a translation table usable for str.translate().
13177
13178If there is only one argument, it must be a dictionary mapping Unicode
13179ordinals (integers) or characters to Unicode ordinals, strings or None.
13180Character keys will be then converted to ordinals.
13181If there are two arguments, they must be strings of equal length, and
13182in the resulting dictionary, each character in x will be mapped to the
13183character at the same position in y. If there is a third argument, it
13184must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013185[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013186
Larry Hastings31826802013-10-19 00:09:25 -070013187static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013188unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013189/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013190{
Georg Brandlceee0772007-11-27 23:48:05 +000013191 PyObject *new = NULL, *key, *value;
13192 Py_ssize_t i = 0;
13193 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013194
Georg Brandlceee0772007-11-27 23:48:05 +000013195 new = PyDict_New();
13196 if (!new)
13197 return NULL;
13198 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 int x_kind, y_kind, z_kind;
13200 void *x_data, *y_data, *z_data;
13201
Georg Brandlceee0772007-11-27 23:48:05 +000013202 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013203 if (!PyUnicode_Check(x)) {
13204 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13205 "be a string if there is a second argument");
13206 goto err;
13207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013209 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13210 "arguments must have equal length");
13211 goto err;
13212 }
13213 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013214 x_kind = PyUnicode_KIND(x);
13215 y_kind = PyUnicode_KIND(y);
13216 x_data = PyUnicode_DATA(x);
13217 y_data = PyUnicode_DATA(y);
13218 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13219 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013220 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013221 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013222 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013223 if (!value) {
13224 Py_DECREF(key);
13225 goto err;
13226 }
Georg Brandlceee0772007-11-27 23:48:05 +000013227 res = PyDict_SetItem(new, key, value);
13228 Py_DECREF(key);
13229 Py_DECREF(value);
13230 if (res < 0)
13231 goto err;
13232 }
13233 /* create entries for deleting chars in z */
13234 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235 z_kind = PyUnicode_KIND(z);
13236 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013237 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013238 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013239 if (!key)
13240 goto err;
13241 res = PyDict_SetItem(new, key, Py_None);
13242 Py_DECREF(key);
13243 if (res < 0)
13244 goto err;
13245 }
13246 }
13247 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248 int kind;
13249 void *data;
13250
Georg Brandlceee0772007-11-27 23:48:05 +000013251 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013252 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013253 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13254 "to maketrans it must be a dict");
13255 goto err;
13256 }
13257 /* copy entries into the new dict, converting string keys to int keys */
13258 while (PyDict_Next(x, &i, &key, &value)) {
13259 if (PyUnicode_Check(key)) {
13260 /* convert string keys to integer keys */
13261 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013262 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013263 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13264 "table must be of length 1");
13265 goto err;
13266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013267 kind = PyUnicode_KIND(key);
13268 data = PyUnicode_DATA(key);
13269 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013270 if (!newkey)
13271 goto err;
13272 res = PyDict_SetItem(new, newkey, value);
13273 Py_DECREF(newkey);
13274 if (res < 0)
13275 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013276 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013277 /* just keep integer keys */
13278 if (PyDict_SetItem(new, key, value) < 0)
13279 goto err;
13280 } else {
13281 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13282 "be strings or integers");
13283 goto err;
13284 }
13285 }
13286 }
13287 return new;
13288 err:
13289 Py_DECREF(new);
13290 return NULL;
13291}
13292
INADA Naoki3ae20562017-01-16 20:41:20 +090013293/*[clinic input]
13294str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295
INADA Naoki3ae20562017-01-16 20:41:20 +090013296 table: object
13297 Translation table, which must be a mapping of Unicode ordinals to
13298 Unicode ordinals, strings, or None.
13299 /
13300
13301Replace each character in the string using the given translation table.
13302
13303The table must implement lookup/indexing via __getitem__, for instance a
13304dictionary or list. If this operation raises LookupError, the character is
13305left untouched. Characters mapped to None are deleted.
13306[clinic start generated code]*/
13307
13308static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013310/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313}
13314
INADA Naoki3ae20562017-01-16 20:41:20 +090013315/*[clinic input]
13316str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317
INADA Naoki3ae20562017-01-16 20:41:20 +090013318Return a copy of the string converted to uppercase.
13319[clinic start generated code]*/
13320
13321static PyObject *
13322unicode_upper_impl(PyObject *self)
13323/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013325 if (PyUnicode_READY(self) == -1)
13326 return NULL;
13327 if (PyUnicode_IS_ASCII(self))
13328 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013329 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330}
13331
INADA Naoki3ae20562017-01-16 20:41:20 +090013332/*[clinic input]
13333str.zfill as unicode_zfill
13334
13335 width: Py_ssize_t
13336 /
13337
13338Pad a numeric string with zeros on the left, to fill a field of the given width.
13339
13340The string is never truncated.
13341[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342
13343static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013344unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013345/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013347 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013348 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013349 int kind;
13350 void *data;
13351 Py_UCS4 chr;
13352
Benjamin Petersonbac79492012-01-14 13:34:47 -050013353 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355
Victor Stinnerc4b49542011-12-11 22:44:26 +010013356 if (PyUnicode_GET_LENGTH(self) >= width)
13357 return unicode_result_unchanged(self);
13358
13359 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360
13361 u = pad(self, fill, 0, '0');
13362
Walter Dörwald068325e2002-04-15 13:36:47 +000013363 if (u == NULL)
13364 return NULL;
13365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013366 kind = PyUnicode_KIND(u);
13367 data = PyUnicode_DATA(u);
13368 chr = PyUnicode_READ(kind, data, fill);
13369
13370 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013372 PyUnicode_WRITE(kind, data, 0, chr);
13373 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374 }
13375
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013376 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013377 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013379
#if 0
/* Compiled out.  When enabled, returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013388PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013390\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013391Return True if S starts with the specified prefix, False otherwise.\n\
13392With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013393With optional end, stop comparing S at that position.\n\
13394prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395
13396static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013397unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013399{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013400 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013401 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013402 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013403 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013404 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013405
Jesus Ceaac451502011-04-20 17:09:23 +020013406 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013408 if (PyTuple_Check(subobj)) {
13409 Py_ssize_t i;
13410 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013411 substring = PyTuple_GET_ITEM(subobj, i);
13412 if (!PyUnicode_Check(substring)) {
13413 PyErr_Format(PyExc_TypeError,
13414 "tuple for startswith must only contain str, "
13415 "not %.100s",
13416 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013417 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013418 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013419 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013420 if (result == -1)
13421 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013422 if (result) {
13423 Py_RETURN_TRUE;
13424 }
13425 }
13426 /* nothing matched */
13427 Py_RETURN_FALSE;
13428 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013429 if (!PyUnicode_Check(subobj)) {
13430 PyErr_Format(PyExc_TypeError,
13431 "startswith first arg must be str or "
13432 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013434 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013435 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013436 if (result == -1)
13437 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013438 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439}
13440
13441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013442PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013445Return True if S ends with the specified suffix, False otherwise.\n\
13446With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013447With optional end, stop comparing S at that position.\n\
13448suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449
13450static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013451unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013452 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013453{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013454 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013455 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013456 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013457 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013458 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013459
Jesus Ceaac451502011-04-20 17:09:23 +020013460 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013461 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013462 if (PyTuple_Check(subobj)) {
13463 Py_ssize_t i;
13464 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013465 substring = PyTuple_GET_ITEM(subobj, i);
13466 if (!PyUnicode_Check(substring)) {
13467 PyErr_Format(PyExc_TypeError,
13468 "tuple for endswith must only contain str, "
13469 "not %.100s",
13470 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013471 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013472 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013473 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013474 if (result == -1)
13475 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013476 if (result) {
13477 Py_RETURN_TRUE;
13478 }
13479 }
13480 Py_RETURN_FALSE;
13481 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013482 if (!PyUnicode_Check(subobj)) {
13483 PyErr_Format(PyExc_TypeError,
13484 "endswith first arg must be str or "
13485 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013486 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013487 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013488 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013489 if (result == -1)
13490 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013491 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013492}
13493
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013494static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013495_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013496{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013497 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13498 writer->data = PyUnicode_DATA(writer->buffer);
13499
13500 if (!writer->readonly) {
13501 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013502 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013503 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013504 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013505 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13506 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13507 writer->kind = PyUnicode_WCHAR_KIND;
13508 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13509
Victor Stinner8f674cc2013-04-17 23:02:17 +020013510 /* Copy-on-write mode: set buffer size to 0 so
13511 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13512 * next write. */
13513 writer->size = 0;
13514 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013515}
13516
Victor Stinnerd3f08822012-05-29 12:57:52 +020013517void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013518_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013519{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013520 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013521
13522 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013523 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013524
13525 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13526 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13527 writer->kind = PyUnicode_WCHAR_KIND;
13528 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013529}
13530
Victor Stinnerd3f08822012-05-29 12:57:52 +020013531int
13532_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13533 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013534{
13535 Py_ssize_t newlen;
13536 PyObject *newbuffer;
13537
Victor Stinner2740e462016-09-06 16:58:36 -070013538 assert(maxchar <= MAX_UNICODE);
13539
Victor Stinnerca9381e2015-09-22 00:58:32 +020013540 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013541 assert((maxchar > writer->maxchar && length >= 0)
13542 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013543
Victor Stinner202fdca2012-05-07 12:47:02 +020013544 if (length > PY_SSIZE_T_MAX - writer->pos) {
13545 PyErr_NoMemory();
13546 return -1;
13547 }
13548 newlen = writer->pos + length;
13549
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013550 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013551
Victor Stinnerd3f08822012-05-29 12:57:52 +020013552 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013553 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013554 if (writer->overallocate
13555 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13556 /* overallocate to limit the number of realloc() */
13557 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013558 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013559 if (newlen < writer->min_length)
13560 newlen = writer->min_length;
13561
Victor Stinnerd3f08822012-05-29 12:57:52 +020013562 writer->buffer = PyUnicode_New(newlen, maxchar);
13563 if (writer->buffer == NULL)
13564 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013565 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013566 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013567 if (writer->overallocate
13568 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13569 /* overallocate to limit the number of realloc() */
13570 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013571 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013572 if (newlen < writer->min_length)
13573 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013574
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013575 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013576 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013577 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013578 newbuffer = PyUnicode_New(newlen, maxchar);
13579 if (newbuffer == NULL)
13580 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013581 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13582 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013583 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013584 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013585 }
13586 else {
13587 newbuffer = resize_compact(writer->buffer, newlen);
13588 if (newbuffer == NULL)
13589 return -1;
13590 }
13591 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013592 }
13593 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013594 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013595 newbuffer = PyUnicode_New(writer->size, maxchar);
13596 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013597 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013598 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13599 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013600 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013601 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013602 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013603 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013604
13605#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013606}
13607
Victor Stinnerca9381e2015-09-22 00:58:32 +020013608int
13609_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13610 enum PyUnicode_Kind kind)
13611{
13612 Py_UCS4 maxchar;
13613
13614 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13615 assert(writer->kind < kind);
13616
13617 switch (kind)
13618 {
13619 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13620 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13621 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13622 default:
13623 assert(0 && "invalid kind");
13624 return -1;
13625 }
13626
13627 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13628}
13629
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013630static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013631_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013632{
Victor Stinner2740e462016-09-06 16:58:36 -070013633 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013634 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13635 return -1;
13636 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13637 writer->pos++;
13638 return 0;
13639}
13640
13641int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013642_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13643{
13644 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13645}
13646
13647int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013648_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13649{
13650 Py_UCS4 maxchar;
13651 Py_ssize_t len;
13652
13653 if (PyUnicode_READY(str) == -1)
13654 return -1;
13655 len = PyUnicode_GET_LENGTH(str);
13656 if (len == 0)
13657 return 0;
13658 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13659 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013660 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013661 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013662 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013663 Py_INCREF(str);
13664 writer->buffer = str;
13665 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013666 writer->pos += len;
13667 return 0;
13668 }
13669 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13670 return -1;
13671 }
13672 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13673 str, 0, len);
13674 writer->pos += len;
13675 return 0;
13676}
13677
Victor Stinnere215d962012-10-06 23:03:36 +020013678int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013679_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13680 Py_ssize_t start, Py_ssize_t end)
13681{
13682 Py_UCS4 maxchar;
13683 Py_ssize_t len;
13684
13685 if (PyUnicode_READY(str) == -1)
13686 return -1;
13687
13688 assert(0 <= start);
13689 assert(end <= PyUnicode_GET_LENGTH(str));
13690 assert(start <= end);
13691
13692 if (end == 0)
13693 return 0;
13694
13695 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13696 return _PyUnicodeWriter_WriteStr(writer, str);
13697
13698 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13699 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13700 else
13701 maxchar = writer->maxchar;
13702 len = end - start;
13703
13704 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13705 return -1;
13706
13707 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13708 str, start, len);
13709 writer->pos += len;
13710 return 0;
13711}
13712
13713int
Victor Stinner4a587072013-11-19 12:54:53 +010013714_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13715 const char *ascii, Py_ssize_t len)
13716{
13717 if (len == -1)
13718 len = strlen(ascii);
13719
13720 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13721
13722 if (writer->buffer == NULL && !writer->overallocate) {
13723 PyObject *str;
13724
13725 str = _PyUnicode_FromASCII(ascii, len);
13726 if (str == NULL)
13727 return -1;
13728
13729 writer->readonly = 1;
13730 writer->buffer = str;
13731 _PyUnicodeWriter_Update(writer);
13732 writer->pos += len;
13733 return 0;
13734 }
13735
13736 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13737 return -1;
13738
13739 switch (writer->kind)
13740 {
13741 case PyUnicode_1BYTE_KIND:
13742 {
13743 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13744 Py_UCS1 *data = writer->data;
13745
Christian Heimesf051e432016-09-13 20:22:02 +020013746 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013747 break;
13748 }
13749 case PyUnicode_2BYTE_KIND:
13750 {
13751 _PyUnicode_CONVERT_BYTES(
13752 Py_UCS1, Py_UCS2,
13753 ascii, ascii + len,
13754 (Py_UCS2 *)writer->data + writer->pos);
13755 break;
13756 }
13757 case PyUnicode_4BYTE_KIND:
13758 {
13759 _PyUnicode_CONVERT_BYTES(
13760 Py_UCS1, Py_UCS4,
13761 ascii, ascii + len,
13762 (Py_UCS4 *)writer->data + writer->pos);
13763 break;
13764 }
13765 default:
13766 assert(0);
13767 }
13768
13769 writer->pos += len;
13770 return 0;
13771}
13772
13773int
13774_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13775 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013776{
13777 Py_UCS4 maxchar;
13778
13779 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13780 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13781 return -1;
13782 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13783 writer->pos += len;
13784 return 0;
13785}
13786
Victor Stinnerd3f08822012-05-29 12:57:52 +020013787PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013788_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013789{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013790 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013791
Victor Stinnerd3f08822012-05-29 12:57:52 +020013792 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013793 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013794 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013795 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013796
13797 str = writer->buffer;
13798 writer->buffer = NULL;
13799
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013800 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013801 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13802 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013803 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013804
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013805 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13806 PyObject *str2;
13807 str2 = resize_compact(str, writer->pos);
13808 if (str2 == NULL) {
13809 Py_DECREF(str);
13810 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013811 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013812 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013813 }
13814
Victor Stinner15a0bd32013-07-08 22:29:55 +020013815 assert(_PyUnicode_CheckConsistency(str, 1));
13816 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013817}
13818
Victor Stinnerd3f08822012-05-29 12:57:52 +020013819void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013820_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013821{
13822 Py_CLEAR(writer->buffer);
13823}
13824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013825#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013826
/* Docstrings for str.format() and str.format_map(); both are referenced
   from the unicode_methods table. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013838
INADA Naoki3ae20562017-01-16 20:41:20 +090013839/*[clinic input]
13840str.__format__ as unicode___format__
13841
13842 format_spec: unicode
13843 /
13844
13845Return a formatted version of the string as described by format_spec.
13846[clinic start generated code]*/
13847
Eric Smith4a7d76d2008-05-30 18:10:19 +000013848static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013849unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013850/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013851{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013852 _PyUnicodeWriter writer;
13853 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013854
Victor Stinnerd3f08822012-05-29 12:57:52 +020013855 if (PyUnicode_READY(self) == -1)
13856 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013857 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013858 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13859 self, format_spec, 0,
13860 PyUnicode_GET_LENGTH(format_spec));
13861 if (ret == -1) {
13862 _PyUnicodeWriter_Dealloc(&writer);
13863 return NULL;
13864 }
13865 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013866}
13867
INADA Naoki3ae20562017-01-16 20:41:20 +090013868/*[clinic input]
13869str.__sizeof__ as unicode_sizeof
13870
13871Return the size of the string in memory, in bytes.
13872[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013873
13874static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013875unicode_sizeof_impl(PyObject *self)
13876/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013878 Py_ssize_t size;
13879
13880 /* If it's a compact object, account for base structure +
13881 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013882 if (PyUnicode_IS_COMPACT_ASCII(self))
13883 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13884 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013885 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013886 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013887 else {
13888 /* If it is a two-block object, account for base object, and
13889 for character block if present. */
13890 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013891 if (_PyUnicode_DATA_ANY(self))
13892 size += (PyUnicode_GET_LENGTH(self) + 1) *
13893 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013894 }
13895 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013896 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013897 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13898 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13899 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13900 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013901
13902 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013903}
13904
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013905static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013906unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013907{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013908 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013909 if (!copy)
13910 return NULL;
13911 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013912}
13913
/* Method table of the str type.

   Entries spelled UNICODE_*_METHODDEF are macros that expand to complete
   table entries (presumably generated by Argument Clinic -- they are
   defined outside this part of the file).  The remaining entries are
   written out by hand.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}  /* sentinel */
};
13969
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013970static PyObject *
13971unicode_mod(PyObject *v, PyObject *w)
13972{
Brian Curtindfc80e32011-08-10 20:28:54 -050013973 if (!PyUnicode_Check(v))
13974 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013975 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013976}
13977
13978static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 0, /*nb_add*/
13980 0, /*nb_subtract*/
13981 0, /*nb_multiply*/
13982 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013983};
13984
Guido van Rossumd57fd912000-03-10 22:53:23 +000013985static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013986 (lenfunc) unicode_length, /* sq_length */
13987 PyUnicode_Concat, /* sq_concat */
13988 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13989 (ssizeargfunc) unicode_getitem, /* sq_item */
13990 0, /* sq_slice */
13991 0, /* sq_ass_item */
13992 0, /* sq_ass_slice */
13993 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013994};
13995
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013996static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013997unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013999 if (PyUnicode_READY(self) == -1)
14000 return NULL;
14001
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014002 if (PyIndex_Check(item)) {
14003 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014004 if (i == -1 && PyErr_Occurred())
14005 return NULL;
14006 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014007 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014008 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014009 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000014010 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014011 PyObject *result;
14012 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014013 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014014 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014015
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014016 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014017 return NULL;
14018 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014019 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14020 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014021
14022 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014023 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014024 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014025 slicelength == PyUnicode_GET_LENGTH(self)) {
14026 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014027 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014028 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014029 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014030 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014031 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014032 src_kind = PyUnicode_KIND(self);
14033 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014034 if (!PyUnicode_IS_ASCII(self)) {
14035 kind_limit = kind_maxchar_limit(src_kind);
14036 max_char = 0;
14037 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14038 ch = PyUnicode_READ(src_kind, src_data, cur);
14039 if (ch > max_char) {
14040 max_char = ch;
14041 if (max_char >= kind_limit)
14042 break;
14043 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014044 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014045 }
Victor Stinner55c99112011-10-13 01:17:06 +020014046 else
14047 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014048 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014049 if (result == NULL)
14050 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014051 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014052 dest_data = PyUnicode_DATA(result);
14053
14054 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014055 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14056 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014057 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014058 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014059 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014060 } else {
14061 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14062 return NULL;
14063 }
14064}
14065
14066static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 (lenfunc)unicode_length, /* mp_length */
14068 (binaryfunc)unicode_subscript, /* mp_subscript */
14069 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014070};
14071
Guido van Rossumd57fd912000-03-10 22:53:23 +000014072
Guido van Rossumd57fd912000-03-10 22:53:23 +000014073/* Helpers for PyUnicode_Format() */
14074
Victor Stinnera47082312012-10-04 02:19:54 +020014075struct unicode_formatter_t {
14076 PyObject *args;
14077 int args_owned;
14078 Py_ssize_t arglen, argidx;
14079 PyObject *dict;
14080
14081 enum PyUnicode_Kind fmtkind;
14082 Py_ssize_t fmtcnt, fmtpos;
14083 void *fmtdata;
14084 PyObject *fmtstr;
14085
14086 _PyUnicodeWriter writer;
14087};
14088
14089struct unicode_format_arg_t {
14090 Py_UCS4 ch;
14091 int flags;
14092 Py_ssize_t width;
14093 int prec;
14094 int sign;
14095};
14096
Guido van Rossumd57fd912000-03-10 22:53:23 +000014097static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014098unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014099{
Victor Stinnera47082312012-10-04 02:19:54 +020014100 Py_ssize_t argidx = ctx->argidx;
14101
14102 if (argidx < ctx->arglen) {
14103 ctx->argidx++;
14104 if (ctx->arglen < 0)
14105 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014106 else
Victor Stinnera47082312012-10-04 02:19:54 +020014107 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014108 }
14109 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014110 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014111 return NULL;
14112}
14113
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014114/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014115
Victor Stinnera47082312012-10-04 02:19:54 +020014116/* Format a float into the writer if the writer is not NULL, or into *p_output
14117 otherwise.
14118
14119 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014120static int
Victor Stinnera47082312012-10-04 02:19:54 +020014121formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14122 PyObject **p_output,
14123 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014124{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014125 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014126 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014127 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014128 int prec;
14129 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014130
Guido van Rossumd57fd912000-03-10 22:53:23 +000014131 x = PyFloat_AsDouble(v);
14132 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014133 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014134
Victor Stinnera47082312012-10-04 02:19:54 +020014135 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014136 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014138
Victor Stinnera47082312012-10-04 02:19:54 +020014139 if (arg->flags & F_ALT)
14140 dtoa_flags = Py_DTSF_ALT;
14141 else
14142 dtoa_flags = 0;
14143 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014144 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014145 return -1;
14146 len = strlen(p);
14147 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014148 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014149 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014150 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014151 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014152 }
14153 else
14154 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014155 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014156 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014157}
14158
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X
 * for Python ints.
 * Return value: a new reference to a str object, or NULL if error.
 * The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 * The base marker is present only for o, x and X conversions, and only
 * when alt is non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base marker (the caller's F_ALT flag)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  the result can start with a '-' sign for every conversion,
 * including o, x and X.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep numnondigits + prec (computed below as an int, with
       numnondigits at most 3) from overflowing. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
        /* without assertions, unknown codes fall through to decimal */
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set.  This is done in place:
       buf is advanced past the two marker characters and the sign, if
       any, is rewritten in front of the digits. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the minimum number of digits
       (prec).  The padded text is built in a temporary bytes object,
       which then replaces `result`. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker first, then the zeroes, then the digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in the temporary bytes object, or no longer
       starts at the beginning of the string's own data (marker stripped),
       build a fresh str from it; otherwise only the length may need
       adjusting. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14304
Ethan Furmandf3ed242014-01-05 06:50:30 -080014305/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014306 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014307 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014308 * -1 and raise an exception on error */
14309static int
Victor Stinnera47082312012-10-04 02:19:54 +020014310mainformatlong(PyObject *v,
14311 struct unicode_format_arg_t *arg,
14312 PyObject **p_output,
14313 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014314{
14315 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014316 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014317
14318 if (!PyNumber_Check(v))
14319 goto wrongtype;
14320
Ethan Furman9ab74802014-03-21 06:38:46 -070014321 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014322 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014323 if (type == 'o' || type == 'x' || type == 'X') {
14324 iobj = PyNumber_Index(v);
14325 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014326 if (PyErr_ExceptionMatches(PyExc_TypeError))
14327 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014328 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014329 }
14330 }
14331 else {
14332 iobj = PyNumber_Long(v);
14333 if (iobj == NULL ) {
14334 if (PyErr_ExceptionMatches(PyExc_TypeError))
14335 goto wrongtype;
14336 return -1;
14337 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014338 }
14339 assert(PyLong_Check(iobj));
14340 }
14341 else {
14342 iobj = v;
14343 Py_INCREF(iobj);
14344 }
14345
14346 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014347 && arg->width == -1 && arg->prec == -1
14348 && !(arg->flags & (F_SIGN | F_BLANK))
14349 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014350 {
14351 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014352 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014353 int base;
14354
Victor Stinnera47082312012-10-04 02:19:54 +020014355 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014356 {
14357 default:
14358 assert(0 && "'type' not in [diuoxX]");
14359 case 'd':
14360 case 'i':
14361 case 'u':
14362 base = 10;
14363 break;
14364 case 'o':
14365 base = 8;
14366 break;
14367 case 'x':
14368 case 'X':
14369 base = 16;
14370 break;
14371 }
14372
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014373 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14374 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014375 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014376 }
14377 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014378 return 1;
14379 }
14380
Ethan Furmanb95b5612015-01-23 20:05:18 -080014381 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014382 Py_DECREF(iobj);
14383 if (res == NULL)
14384 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014385 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014386 return 0;
14387
14388wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014389 switch(type)
14390 {
14391 case 'o':
14392 case 'x':
14393 case 'X':
14394 PyErr_Format(PyExc_TypeError,
14395 "%%%c format: an integer is required, "
14396 "not %.200s",
14397 type, Py_TYPE(v)->tp_name);
14398 break;
14399 default:
14400 PyErr_Format(PyExc_TypeError,
14401 "%%%c format: a number is required, "
14402 "not %.200s",
14403 type, Py_TYPE(v)->tp_name);
14404 break;
14405 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014406 return -1;
14407}
14408
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014409static Py_UCS4
14410formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014411{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014412 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014413 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014414 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014415 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014416 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 goto onError;
14418 }
14419 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014420 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014421 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014422 /* make sure number is a type of integer */
14423 if (!PyLong_Check(v)) {
14424 iobj = PyNumber_Index(v);
14425 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014426 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014427 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014428 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014429 Py_DECREF(iobj);
14430 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014431 else {
14432 x = PyLong_AsLong(v);
14433 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014434 if (x == -1 && PyErr_Occurred())
14435 goto onError;
14436
Victor Stinner8faf8212011-12-08 22:14:11 +010014437 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014438 PyErr_SetString(PyExc_OverflowError,
14439 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014440 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014441 }
14442
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014443 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014444 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014445
Benjamin Peterson29060642009-01-31 22:14:21 +000014446 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014447 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014448 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014449 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014450}
14451
Victor Stinnera47082312012-10-04 02:19:54 +020014452/* Parse options of an argument: flags, width, precision.
14453 Handle also "%(name)" syntax.
14454
14455 Return 0 if the argument has been formatted into arg->str.
14456 Return 1 if the argument has been written into ctx->writer,
14457 Raise an exception and return -1 on error. */
14458static int
14459unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14460 struct unicode_format_arg_t *arg)
14461{
14462#define FORMAT_READ(ctx) \
14463 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14464
14465 PyObject *v;
14466
Victor Stinnera47082312012-10-04 02:19:54 +020014467 if (arg->ch == '(') {
14468 /* Get argument value from a dictionary. Example: "%(name)s". */
14469 Py_ssize_t keystart;
14470 Py_ssize_t keylen;
14471 PyObject *key;
14472 int pcount = 1;
14473
14474 if (ctx->dict == NULL) {
14475 PyErr_SetString(PyExc_TypeError,
14476 "format requires a mapping");
14477 return -1;
14478 }
14479 ++ctx->fmtpos;
14480 --ctx->fmtcnt;
14481 keystart = ctx->fmtpos;
14482 /* Skip over balanced parentheses */
14483 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14484 arg->ch = FORMAT_READ(ctx);
14485 if (arg->ch == ')')
14486 --pcount;
14487 else if (arg->ch == '(')
14488 ++pcount;
14489 ctx->fmtpos++;
14490 }
14491 keylen = ctx->fmtpos - keystart - 1;
14492 if (ctx->fmtcnt < 0 || pcount > 0) {
14493 PyErr_SetString(PyExc_ValueError,
14494 "incomplete format key");
14495 return -1;
14496 }
14497 key = PyUnicode_Substring(ctx->fmtstr,
14498 keystart, keystart + keylen);
14499 if (key == NULL)
14500 return -1;
14501 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014502 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014503 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014504 }
14505 ctx->args = PyObject_GetItem(ctx->dict, key);
14506 Py_DECREF(key);
14507 if (ctx->args == NULL)
14508 return -1;
14509 ctx->args_owned = 1;
14510 ctx->arglen = -1;
14511 ctx->argidx = -2;
14512 }
14513
14514 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014515 while (--ctx->fmtcnt >= 0) {
14516 arg->ch = FORMAT_READ(ctx);
14517 ctx->fmtpos++;
14518 switch (arg->ch) {
14519 case '-': arg->flags |= F_LJUST; continue;
14520 case '+': arg->flags |= F_SIGN; continue;
14521 case ' ': arg->flags |= F_BLANK; continue;
14522 case '#': arg->flags |= F_ALT; continue;
14523 case '0': arg->flags |= F_ZERO; continue;
14524 }
14525 break;
14526 }
14527
14528 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014529 if (arg->ch == '*') {
14530 v = unicode_format_getnextarg(ctx);
14531 if (v == NULL)
14532 return -1;
14533 if (!PyLong_Check(v)) {
14534 PyErr_SetString(PyExc_TypeError,
14535 "* wants int");
14536 return -1;
14537 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014538 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014539 if (arg->width == -1 && PyErr_Occurred())
14540 return -1;
14541 if (arg->width < 0) {
14542 arg->flags |= F_LJUST;
14543 arg->width = -arg->width;
14544 }
14545 if (--ctx->fmtcnt >= 0) {
14546 arg->ch = FORMAT_READ(ctx);
14547 ctx->fmtpos++;
14548 }
14549 }
14550 else if (arg->ch >= '0' && arg->ch <= '9') {
14551 arg->width = arg->ch - '0';
14552 while (--ctx->fmtcnt >= 0) {
14553 arg->ch = FORMAT_READ(ctx);
14554 ctx->fmtpos++;
14555 if (arg->ch < '0' || arg->ch > '9')
14556 break;
14557 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14558 mixing signed and unsigned comparison. Since arg->ch is between
14559 '0' and '9', casting to int is safe. */
14560 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14561 PyErr_SetString(PyExc_ValueError,
14562 "width too big");
14563 return -1;
14564 }
14565 arg->width = arg->width*10 + (arg->ch - '0');
14566 }
14567 }
14568
14569 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014570 if (arg->ch == '.') {
14571 arg->prec = 0;
14572 if (--ctx->fmtcnt >= 0) {
14573 arg->ch = FORMAT_READ(ctx);
14574 ctx->fmtpos++;
14575 }
14576 if (arg->ch == '*') {
14577 v = unicode_format_getnextarg(ctx);
14578 if (v == NULL)
14579 return -1;
14580 if (!PyLong_Check(v)) {
14581 PyErr_SetString(PyExc_TypeError,
14582 "* wants int");
14583 return -1;
14584 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014585 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014586 if (arg->prec == -1 && PyErr_Occurred())
14587 return -1;
14588 if (arg->prec < 0)
14589 arg->prec = 0;
14590 if (--ctx->fmtcnt >= 0) {
14591 arg->ch = FORMAT_READ(ctx);
14592 ctx->fmtpos++;
14593 }
14594 }
14595 else if (arg->ch >= '0' && arg->ch <= '9') {
14596 arg->prec = arg->ch - '0';
14597 while (--ctx->fmtcnt >= 0) {
14598 arg->ch = FORMAT_READ(ctx);
14599 ctx->fmtpos++;
14600 if (arg->ch < '0' || arg->ch > '9')
14601 break;
14602 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14603 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014604 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014605 return -1;
14606 }
14607 arg->prec = arg->prec*10 + (arg->ch - '0');
14608 }
14609 }
14610 }
14611
14612 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14613 if (ctx->fmtcnt >= 0) {
14614 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14615 if (--ctx->fmtcnt >= 0) {
14616 arg->ch = FORMAT_READ(ctx);
14617 ctx->fmtpos++;
14618 }
14619 }
14620 }
14621 if (ctx->fmtcnt < 0) {
14622 PyErr_SetString(PyExc_ValueError,
14623 "incomplete format");
14624 return -1;
14625 }
14626 return 0;
14627
14628#undef FORMAT_READ
14629}
14630
14631/* Format one argument. Supported conversion specifiers:
14632
14633 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014634 - "i", "d", "u": int or float
14635 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014636 - "e", "E", "f", "F", "g", "G": float
14637 - "c": int or str (1 character)
14638
Victor Stinner8dbd4212012-12-04 09:30:24 +010014639 When possible, the output is written directly into the Unicode writer
14640 (ctx->writer). A string is created when padding is required.
14641
Victor Stinnera47082312012-10-04 02:19:54 +020014642 Return 0 if the argument has been formatted into *p_str,
14643 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014644 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014645static int
14646unicode_format_arg_format(struct unicode_formatter_t *ctx,
14647 struct unicode_format_arg_t *arg,
14648 PyObject **p_str)
14649{
14650 PyObject *v;
14651 _PyUnicodeWriter *writer = &ctx->writer;
14652
14653 if (ctx->fmtcnt == 0)
14654 ctx->writer.overallocate = 0;
14655
Victor Stinnera47082312012-10-04 02:19:54 +020014656 v = unicode_format_getnextarg(ctx);
14657 if (v == NULL)
14658 return -1;
14659
Victor Stinnera47082312012-10-04 02:19:54 +020014660
14661 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014662 case 's':
14663 case 'r':
14664 case 'a':
14665 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14666 /* Fast path */
14667 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14668 return -1;
14669 return 1;
14670 }
14671
14672 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14673 *p_str = v;
14674 Py_INCREF(*p_str);
14675 }
14676 else {
14677 if (arg->ch == 's')
14678 *p_str = PyObject_Str(v);
14679 else if (arg->ch == 'r')
14680 *p_str = PyObject_Repr(v);
14681 else
14682 *p_str = PyObject_ASCII(v);
14683 }
14684 break;
14685
14686 case 'i':
14687 case 'd':
14688 case 'u':
14689 case 'o':
14690 case 'x':
14691 case 'X':
14692 {
14693 int ret = mainformatlong(v, arg, p_str, writer);
14694 if (ret != 0)
14695 return ret;
14696 arg->sign = 1;
14697 break;
14698 }
14699
14700 case 'e':
14701 case 'E':
14702 case 'f':
14703 case 'F':
14704 case 'g':
14705 case 'G':
14706 if (arg->width == -1 && arg->prec == -1
14707 && !(arg->flags & (F_SIGN | F_BLANK)))
14708 {
14709 /* Fast path */
14710 if (formatfloat(v, arg, NULL, writer) == -1)
14711 return -1;
14712 return 1;
14713 }
14714
14715 arg->sign = 1;
14716 if (formatfloat(v, arg, p_str, NULL) == -1)
14717 return -1;
14718 break;
14719
14720 case 'c':
14721 {
14722 Py_UCS4 ch = formatchar(v);
14723 if (ch == (Py_UCS4) -1)
14724 return -1;
14725 if (arg->width == -1 && arg->prec == -1) {
14726 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014727 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014728 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014729 return 1;
14730 }
14731 *p_str = PyUnicode_FromOrdinal(ch);
14732 break;
14733 }
14734
14735 default:
14736 PyErr_Format(PyExc_ValueError,
14737 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014738 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014739 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14740 (int)arg->ch,
14741 ctx->fmtpos - 1);
14742 return -1;
14743 }
14744 if (*p_str == NULL)
14745 return -1;
14746 assert (PyUnicode_Check(*p_str));
14747 return 0;
14748}
14749
14750static int
14751unicode_format_arg_output(struct unicode_formatter_t *ctx,
14752 struct unicode_format_arg_t *arg,
14753 PyObject *str)
14754{
14755 Py_ssize_t len;
14756 enum PyUnicode_Kind kind;
14757 void *pbuf;
14758 Py_ssize_t pindex;
14759 Py_UCS4 signchar;
14760 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014761 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014762 Py_ssize_t sublen;
14763 _PyUnicodeWriter *writer = &ctx->writer;
14764 Py_UCS4 fill;
14765
14766 fill = ' ';
14767 if (arg->sign && arg->flags & F_ZERO)
14768 fill = '0';
14769
14770 if (PyUnicode_READY(str) == -1)
14771 return -1;
14772
14773 len = PyUnicode_GET_LENGTH(str);
14774 if ((arg->width == -1 || arg->width <= len)
14775 && (arg->prec == -1 || arg->prec >= len)
14776 && !(arg->flags & (F_SIGN | F_BLANK)))
14777 {
14778 /* Fast path */
14779 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14780 return -1;
14781 return 0;
14782 }
14783
14784 /* Truncate the string for "s", "r" and "a" formats
14785 if the precision is set */
14786 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14787 if (arg->prec >= 0 && len > arg->prec)
14788 len = arg->prec;
14789 }
14790
14791 /* Adjust sign and width */
14792 kind = PyUnicode_KIND(str);
14793 pbuf = PyUnicode_DATA(str);
14794 pindex = 0;
14795 signchar = '\0';
14796 if (arg->sign) {
14797 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14798 if (ch == '-' || ch == '+') {
14799 signchar = ch;
14800 len--;
14801 pindex++;
14802 }
14803 else if (arg->flags & F_SIGN)
14804 signchar = '+';
14805 else if (arg->flags & F_BLANK)
14806 signchar = ' ';
14807 else
14808 arg->sign = 0;
14809 }
14810 if (arg->width < len)
14811 arg->width = len;
14812
14813 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014814 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014815 if (!(arg->flags & F_LJUST)) {
14816 if (arg->sign) {
14817 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014818 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014819 }
14820 else {
14821 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014822 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014823 }
14824 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014825 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14826 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014827 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014828 }
14829
Victor Stinnera47082312012-10-04 02:19:54 +020014830 buflen = arg->width;
14831 if (arg->sign && len == arg->width)
14832 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014833 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014834 return -1;
14835
14836 /* Write the sign if needed */
14837 if (arg->sign) {
14838 if (fill != ' ') {
14839 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14840 writer->pos += 1;
14841 }
14842 if (arg->width > len)
14843 arg->width--;
14844 }
14845
14846 /* Write the numeric prefix for "x", "X" and "o" formats
14847 if the alternate form is used.
14848 For example, write "0x" for the "%#x" format. */
14849 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14850 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14851 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14852 if (fill != ' ') {
14853 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14854 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14855 writer->pos += 2;
14856 pindex += 2;
14857 }
14858 arg->width -= 2;
14859 if (arg->width < 0)
14860 arg->width = 0;
14861 len -= 2;
14862 }
14863
14864 /* Pad left with the fill character if needed */
14865 if (arg->width > len && !(arg->flags & F_LJUST)) {
14866 sublen = arg->width - len;
14867 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14868 writer->pos += sublen;
14869 arg->width = len;
14870 }
14871
14872 /* If padding with spaces: write sign if needed and/or numeric prefix if
14873 the alternate form is used */
14874 if (fill == ' ') {
14875 if (arg->sign) {
14876 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14877 writer->pos += 1;
14878 }
14879 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14880 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14881 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14882 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14883 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14884 writer->pos += 2;
14885 pindex += 2;
14886 }
14887 }
14888
14889 /* Write characters */
14890 if (len) {
14891 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14892 str, pindex, len);
14893 writer->pos += len;
14894 }
14895
14896 /* Pad right with the fill character if needed */
14897 if (arg->width > len) {
14898 sublen = arg->width - len;
14899 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14900 writer->pos += sublen;
14901 }
14902 return 0;
14903}
14904
14905/* Helper of PyUnicode_Format(): format one arg.
14906 Return 0 on success, raise an exception and return -1 on error. */
14907static int
14908unicode_format_arg(struct unicode_formatter_t *ctx)
14909{
14910 struct unicode_format_arg_t arg;
14911 PyObject *str;
14912 int ret;
14913
Victor Stinner8dbd4212012-12-04 09:30:24 +010014914 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014915 if (arg.ch == '%') {
14916 ctx->fmtpos++;
14917 ctx->fmtcnt--;
14918 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14919 return -1;
14920 return 0;
14921 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014922 arg.flags = 0;
14923 arg.width = -1;
14924 arg.prec = -1;
14925 arg.sign = 0;
14926 str = NULL;
14927
Victor Stinnera47082312012-10-04 02:19:54 +020014928 ret = unicode_format_arg_parse(ctx, &arg);
14929 if (ret == -1)
14930 return -1;
14931
14932 ret = unicode_format_arg_format(ctx, &arg, &str);
14933 if (ret == -1)
14934 return -1;
14935
14936 if (ret != 1) {
14937 ret = unicode_format_arg_output(ctx, &arg, str);
14938 Py_DECREF(str);
14939 if (ret == -1)
14940 return -1;
14941 }
14942
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014943 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014944 PyErr_SetString(PyExc_TypeError,
14945 "not all arguments converted during string formatting");
14946 return -1;
14947 }
14948 return 0;
14949}
14950
Alexander Belopolsky40018472011-02-26 01:02:56 +000014951PyObject *
14952PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014953{
Victor Stinnera47082312012-10-04 02:19:54 +020014954 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014955
Guido van Rossumd57fd912000-03-10 22:53:23 +000014956 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014957 PyErr_BadInternalCall();
14958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014959 }
Victor Stinnera47082312012-10-04 02:19:54 +020014960
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014961 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014962 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014963
14964 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014965 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14966 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14967 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14968 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014969
Victor Stinner8f674cc2013-04-17 23:02:17 +020014970 _PyUnicodeWriter_Init(&ctx.writer);
14971 ctx.writer.min_length = ctx.fmtcnt + 100;
14972 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014973
Guido van Rossumd57fd912000-03-10 22:53:23 +000014974 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014975 ctx.arglen = PyTuple_Size(args);
14976 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014977 }
14978 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014979 ctx.arglen = -1;
14980 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014981 }
Victor Stinnera47082312012-10-04 02:19:54 +020014982 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014983 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014984 ctx.dict = args;
14985 else
14986 ctx.dict = NULL;
14987 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014988
Victor Stinnera47082312012-10-04 02:19:54 +020014989 while (--ctx.fmtcnt >= 0) {
14990 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014991 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014992
14993 nonfmtpos = ctx.fmtpos++;
14994 while (ctx.fmtcnt >= 0 &&
14995 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14996 ctx.fmtpos++;
14997 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014998 }
Victor Stinnera47082312012-10-04 02:19:54 +020014999 if (ctx.fmtcnt < 0) {
15000 ctx.fmtpos--;
15001 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015002 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015003
Victor Stinnercfc4c132013-04-03 01:48:39 +020015004 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15005 nonfmtpos, ctx.fmtpos) < 0)
15006 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015007 }
15008 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015009 ctx.fmtpos++;
15010 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015011 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015012 }
15013 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015014
Victor Stinnera47082312012-10-04 02:19:54 +020015015 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015016 PyErr_SetString(PyExc_TypeError,
15017 "not all arguments converted during string formatting");
15018 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015019 }
15020
Victor Stinnera47082312012-10-04 02:19:54 +020015021 if (ctx.args_owned) {
15022 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023 }
Victor Stinnera47082312012-10-04 02:19:54 +020015024 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015025
Benjamin Peterson29060642009-01-31 22:14:21 +000015026 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015027 _PyUnicodeWriter_Dealloc(&ctx.writer);
15028 if (ctx.args_owned) {
15029 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015030 }
15031 return NULL;
15032}
15033
Jeremy Hylton938ace62002-07-17 16:30:39 +000015034static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015035unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15036
Tim Peters6d6c1a32001-08-02 04:15:00 +000015037static PyObject *
15038unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15039{
Benjamin Peterson29060642009-01-31 22:14:21 +000015040 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015041 static char *kwlist[] = {"object", "encoding", "errors", 0};
15042 char *encoding = NULL;
15043 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015044
Benjamin Peterson14339b62009-01-31 16:36:08 +000015045 if (type != &PyUnicode_Type)
15046 return unicode_subtype_new(type, args, kwds);
15047 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015048 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 return NULL;
15050 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015051 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015052 if (encoding == NULL && errors == NULL)
15053 return PyObject_Str(x);
15054 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015055 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015056}
15057
Guido van Rossume023fe02001-08-30 03:12:59 +000015058static PyObject *
15059unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15060{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015061 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 Py_ssize_t length, char_size;
15063 int share_wstr, share_utf8;
15064 unsigned int kind;
15065 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015066
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015068
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015069 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015070 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015071 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015072 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015073 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015074 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015076 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015077
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015078 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015079 if (self == NULL) {
15080 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015081 return NULL;
15082 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015083 kind = PyUnicode_KIND(unicode);
15084 length = PyUnicode_GET_LENGTH(unicode);
15085
15086 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015087#ifdef Py_DEBUG
15088 _PyUnicode_HASH(self) = -1;
15089#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015090 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015091#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015092 _PyUnicode_STATE(self).interned = 0;
15093 _PyUnicode_STATE(self).kind = kind;
15094 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015095 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015096 _PyUnicode_STATE(self).ready = 1;
15097 _PyUnicode_WSTR(self) = NULL;
15098 _PyUnicode_UTF8_LENGTH(self) = 0;
15099 _PyUnicode_UTF8(self) = NULL;
15100 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015101 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015102
15103 share_utf8 = 0;
15104 share_wstr = 0;
15105 if (kind == PyUnicode_1BYTE_KIND) {
15106 char_size = 1;
15107 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15108 share_utf8 = 1;
15109 }
15110 else if (kind == PyUnicode_2BYTE_KIND) {
15111 char_size = 2;
15112 if (sizeof(wchar_t) == 2)
15113 share_wstr = 1;
15114 }
15115 else {
15116 assert(kind == PyUnicode_4BYTE_KIND);
15117 char_size = 4;
15118 if (sizeof(wchar_t) == 4)
15119 share_wstr = 1;
15120 }
15121
15122 /* Ensure we won't overflow the length. */
15123 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15124 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015125 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015126 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015127 data = PyObject_MALLOC((length + 1) * char_size);
15128 if (data == NULL) {
15129 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015130 goto onError;
15131 }
15132
Victor Stinnerc3c74152011-10-02 20:39:55 +020015133 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015134 if (share_utf8) {
15135 _PyUnicode_UTF8_LENGTH(self) = length;
15136 _PyUnicode_UTF8(self) = data;
15137 }
15138 if (share_wstr) {
15139 _PyUnicode_WSTR_LENGTH(self) = length;
15140 _PyUnicode_WSTR(self) = (wchar_t *)data;
15141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015142
Christian Heimesf051e432016-09-13 20:22:02 +020015143 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015144 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015145 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015146#ifdef Py_DEBUG
15147 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15148#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015149 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015150 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015151
15152onError:
15153 Py_DECREF(unicode);
15154 Py_DECREF(self);
15155 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015156}
15157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015158PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015159"str(object='') -> str\n\
15160str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015161\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015162Create a new string object from the given object. If encoding or\n\
15163errors is specified, then the object must expose a data buffer\n\
15164that will be decoded using the given encoding and error handler.\n\
15165Otherwise, returns the result of object.__str__() (if defined)\n\
15166or repr(object).\n\
15167encoding defaults to sys.getdefaultencoding().\n\
15168errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015169
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015170static PyObject *unicode_iter(PyObject *seq);
15171
Guido van Rossumd57fd912000-03-10 22:53:23 +000015172PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015173 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015174 "str", /* tp_name */
15175 sizeof(PyUnicodeObject), /* tp_size */
15176 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015177 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015178 (destructor)unicode_dealloc, /* tp_dealloc */
15179 0, /* tp_print */
15180 0, /* tp_getattr */
15181 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015182 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015183 unicode_repr, /* tp_repr */
15184 &unicode_as_number, /* tp_as_number */
15185 &unicode_as_sequence, /* tp_as_sequence */
15186 &unicode_as_mapping, /* tp_as_mapping */
15187 (hashfunc) unicode_hash, /* tp_hash*/
15188 0, /* tp_call*/
15189 (reprfunc) unicode_str, /* tp_str */
15190 PyObject_GenericGetAttr, /* tp_getattro */
15191 0, /* tp_setattro */
15192 0, /* tp_as_buffer */
15193 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015194 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015195 unicode_doc, /* tp_doc */
15196 0, /* tp_traverse */
15197 0, /* tp_clear */
15198 PyUnicode_RichCompare, /* tp_richcompare */
15199 0, /* tp_weaklistoffset */
15200 unicode_iter, /* tp_iter */
15201 0, /* tp_iternext */
15202 unicode_methods, /* tp_methods */
15203 0, /* tp_members */
15204 0, /* tp_getset */
15205 &PyBaseObject_Type, /* tp_base */
15206 0, /* tp_dict */
15207 0, /* tp_descr_get */
15208 0, /* tp_descr_set */
15209 0, /* tp_dictoffset */
15210 0, /* tp_init */
15211 0, /* tp_alloc */
15212 unicode_new, /* tp_new */
15213 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015214};
15215
15216/* Initialize the Unicode implementation */
15217
Victor Stinner3a50e702011-10-18 21:21:00 +020015218int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015219{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015220 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015221 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015222 0x000A, /* LINE FEED */
15223 0x000D, /* CARRIAGE RETURN */
15224 0x001C, /* FILE SEPARATOR */
15225 0x001D, /* GROUP SEPARATOR */
15226 0x001E, /* RECORD SEPARATOR */
15227 0x0085, /* NEXT LINE */
15228 0x2028, /* LINE SEPARATOR */
15229 0x2029, /* PARAGRAPH SEPARATOR */
15230 };
15231
Fred Drakee4315f52000-05-09 19:53:39 +000015232 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015233 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015234 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015235 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015236 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015237
Guido van Rossumcacfc072002-05-24 19:01:59 +000015238 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015239 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015240
15241 /* initialize the linebreak bloom filter */
15242 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015243 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015244 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015245
Christian Heimes26532f72013-07-20 14:57:16 +020015246 if (PyType_Ready(&EncodingMapType) < 0)
15247 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015248
Benjamin Petersonc4311282012-10-30 23:21:10 -040015249 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15250 Py_FatalError("Can't initialize field name iterator type");
15251
15252 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15253 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015254
Victor Stinner3a50e702011-10-18 21:21:00 +020015255 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015256}
15257
15258/* Finalize the Unicode implementation */
15259
/* Does no work in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15265
Guido van Rossumd57fd912000-03-10 22:53:23 +000015266void
Thomas Wouters78890102000-07-22 19:25:51 +000015267_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015268{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015269 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015270
Serhiy Storchaka05997252013-01-26 12:14:02 +020015271 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015272
Serhiy Storchaka05997252013-01-26 12:14:02 +020015273 for (i = 0; i < 256; i++)
15274 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015275 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015276 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015277}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015278
Walter Dörwald16807132007-05-25 13:52:07 +000015279void
15280PyUnicode_InternInPlace(PyObject **p)
15281{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015282 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015283 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015284#ifdef Py_DEBUG
15285 assert(s != NULL);
15286 assert(_PyUnicode_CHECK(s));
15287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015288 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015289 return;
15290#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 /* If it's a subclass, we don't really know what putting
15292 it in the interned dict might do. */
15293 if (!PyUnicode_CheckExact(s))
15294 return;
15295 if (PyUnicode_CHECK_INTERNED(s))
15296 return;
15297 if (interned == NULL) {
15298 interned = PyDict_New();
15299 if (interned == NULL) {
15300 PyErr_Clear(); /* Don't leave an exception */
15301 return;
15302 }
15303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015304 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015305 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015306 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015307 if (t == NULL) {
15308 PyErr_Clear();
15309 return;
15310 }
15311 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015312 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015313 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015314 return;
15315 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015316 /* The two references in interned are not counted by refcnt.
15317 The deallocator will take care of this */
15318 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015319 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015320}
15321
15322void
15323PyUnicode_InternImmortal(PyObject **p)
15324{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015325 PyUnicode_InternInPlace(p);
15326 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015327 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 Py_INCREF(*p);
15329 }
Walter Dörwald16807132007-05-25 13:52:07 +000015330}
15331
15332PyObject *
15333PyUnicode_InternFromString(const char *cp)
15334{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 PyObject *s = PyUnicode_FromString(cp);
15336 if (s == NULL)
15337 return NULL;
15338 PyUnicode_InternInPlace(&s);
15339 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015340}
15341
Alexander Belopolsky40018472011-02-26 01:02:56 +000015342void
15343_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015344{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015346 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 Py_ssize_t i, n;
15348 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015349
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 if (interned == NULL || !PyDict_Check(interned))
15351 return;
15352 keys = PyDict_Keys(interned);
15353 if (keys == NULL || !PyList_Check(keys)) {
15354 PyErr_Clear();
15355 return;
15356 }
Walter Dörwald16807132007-05-25 13:52:07 +000015357
Benjamin Peterson14339b62009-01-31 16:36:08 +000015358 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15359 detector, interned unicode strings are not forcibly deallocated;
15360 rather, we give them their stolen references back, and then clear
15361 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015362
Benjamin Peterson14339b62009-01-31 16:36:08 +000015363 n = PyList_GET_SIZE(keys);
15364 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015365 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015367 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015368 if (PyUnicode_READY(s) == -1) {
15369 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015370 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015372 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 case SSTATE_NOT_INTERNED:
15374 /* XXX Shouldn't happen */
15375 break;
15376 case SSTATE_INTERNED_IMMORTAL:
15377 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015378 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015379 break;
15380 case SSTATE_INTERNED_MORTAL:
15381 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015382 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015383 break;
15384 default:
15385 Py_FatalError("Inconsistent interned string state.");
15386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015387 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015388 }
15389 fprintf(stderr, "total size of all interned strings: "
15390 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15391 "mortal/immortal\n", mortal_size, immortal_size);
15392 Py_DECREF(keys);
15393 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015394 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015395}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015396
15397
15398/********************* Unicode Iterator **************************/
15399
15400typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015401 PyObject_HEAD
15402 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015403 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015404} unicodeiterobject;
15405
15406static void
15407unicodeiter_dealloc(unicodeiterobject *it)
15408{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015409 _PyObject_GC_UNTRACK(it);
15410 Py_XDECREF(it->it_seq);
15411 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015412}
15413
15414static int
15415unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15416{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 Py_VISIT(it->it_seq);
15418 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015419}
15420
15421static PyObject *
15422unicodeiter_next(unicodeiterobject *it)
15423{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015424 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015425
Benjamin Peterson14339b62009-01-31 16:36:08 +000015426 assert(it != NULL);
15427 seq = it->it_seq;
15428 if (seq == NULL)
15429 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015430 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015432 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15433 int kind = PyUnicode_KIND(seq);
15434 void *data = PyUnicode_DATA(seq);
15435 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15436 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015437 if (item != NULL)
15438 ++it->it_index;
15439 return item;
15440 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015441
Benjamin Peterson14339b62009-01-31 16:36:08 +000015442 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015443 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015444 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015445}
15446
15447static PyObject *
15448unicodeiter_len(unicodeiterobject *it)
15449{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015450 Py_ssize_t len = 0;
15451 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015452 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015453 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015454}
15455
15456PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15457
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015458static PyObject *
15459unicodeiter_reduce(unicodeiterobject *it)
15460{
15461 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015462 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015463 it->it_seq, it->it_index);
15464 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015465 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015466 if (u == NULL)
15467 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015468 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015469 }
15470}
15471
15472PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15473
15474static PyObject *
15475unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15476{
15477 Py_ssize_t index = PyLong_AsSsize_t(state);
15478 if (index == -1 && PyErr_Occurred())
15479 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015480 if (it->it_seq != NULL) {
15481 if (index < 0)
15482 index = 0;
15483 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15484 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15485 it->it_index = index;
15486 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015487 Py_RETURN_NONE;
15488}
15489
15490PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15491
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015492static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015493 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015494 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015495 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15496 reduce_doc},
15497 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15498 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015499 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015500};
15501
15502PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015503 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15504 "str_iterator", /* tp_name */
15505 sizeof(unicodeiterobject), /* tp_basicsize */
15506 0, /* tp_itemsize */
15507 /* methods */
15508 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15509 0, /* tp_print */
15510 0, /* tp_getattr */
15511 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015512 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015513 0, /* tp_repr */
15514 0, /* tp_as_number */
15515 0, /* tp_as_sequence */
15516 0, /* tp_as_mapping */
15517 0, /* tp_hash */
15518 0, /* tp_call */
15519 0, /* tp_str */
15520 PyObject_GenericGetAttr, /* tp_getattro */
15521 0, /* tp_setattro */
15522 0, /* tp_as_buffer */
15523 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15524 0, /* tp_doc */
15525 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15526 0, /* tp_clear */
15527 0, /* tp_richcompare */
15528 0, /* tp_weaklistoffset */
15529 PyObject_SelfIter, /* tp_iter */
15530 (iternextfunc)unicodeiter_next, /* tp_iternext */
15531 unicodeiter_methods, /* tp_methods */
15532 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015533};
15534
15535static PyObject *
15536unicode_iter(PyObject *seq)
15537{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015538 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015539
Benjamin Peterson14339b62009-01-31 16:36:08 +000015540 if (!PyUnicode_Check(seq)) {
15541 PyErr_BadInternalCall();
15542 return NULL;
15543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015544 if (PyUnicode_READY(seq) == -1)
15545 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015546 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15547 if (it == NULL)
15548 return NULL;
15549 it->it_index = 0;
15550 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015551 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015552 _PyObject_GC_TRACK(it);
15553 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015554}
15555
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015556
/* Number of Py_UNICODE elements in the null-terminated string u, not
   counting the terminator.  Thin wrapper around wcslen(). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);
    return length;
}
15562
/* Copy the null-terminated string s2, terminator included, into s1 and
   return s1.  No bounds checking is performed. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15570
/* Copy at most n elements of the null-terminated string s2 into s1 and
   return s1.

   Copying stops right after the terminator has been copied or once n
   elements have been written, whichever comes first.  Unlike wcsncpy(),
   the rest of s1 is not padded with null characters.  As with strncpy(),
   s1 is not null-terminated when s2 holds n or more characters.

   The count is checked before each write, so no more than n elements of
   s1 are ever touched and n == 0 writes nothing. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
15580
/* Append the null-terminated string s2, terminator included, to the end
   of the null-terminated string s1 and return s1.  No bounds checking
   is performed. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing at the current terminator of s1. */
    Py_UNICODE *tail = s1 + wcslen(s1);

    do {
        *tail = *s2++;
    } while (*tail++ != 0);
    return s1;
}
15589
/* Compare two null-terminated strings element by element.

   Returns 0 when they are equal, -1 when s1 sorts before s2 and 1 when
   it sorts after.  A string that is a proper prefix of the other sorts
   first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix.  *s1 == *s2 with *s1 != 0 implies that
       *s2 is not the terminator either. */
    for (; *s1 != 0 && *s1 == *s2; s1++, s2++)
        ;

    if (*s1 == 0) {
        /* s1 ended: equal if s2 ended too, otherwise s1 is a prefix. */
        return (*s2 == 0) ? 0 : -1;
    }
    if (*s2 == 0) {
        /* s2 is a proper prefix of s1. */
        return 1;
    }
    return (*s1 < *s2) ? -1 : +1;
}
15603
/* Compare at most n elements of two null-terminated strings.

   Returns -1 or 1 at the first differing element, and 0 when the
   strings agree up to a common terminator or over the first n
   elements. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n != 0) {
        Py_UNICODE left = *s1++;
        Py_UNICODE right = *s2++;

        if (left != right) {
            return (left < right) ? -1 : +1;
        }
        if (left == '\0') {
            /* Both strings ended at the same position. */
            return 0;
        }
        n--;
    }
    return 0;
}
15620
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15630
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor;

    /* Walk backwards from the terminator towards the start. */
    for (cursor = s + wcslen(s); cursor != s; ) {
        --cursor;
        if (*cursor == c) {
            return (Py_UNICODE*)cursor;
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015643
Victor Stinner71133ff2010-09-01 23:43:53 +000015644Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015645PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015646{
Victor Stinner577db2c2011-10-11 22:12:48 +020015647 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015648 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015650 if (!PyUnicode_Check(unicode)) {
15651 PyErr_BadArgument();
15652 return NULL;
15653 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015654 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015655 if (u == NULL)
15656 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015657 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015658 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015659 PyErr_NoMemory();
15660 return NULL;
15661 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015662 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015663 size *= sizeof(Py_UNICODE);
15664 copy = PyMem_Malloc(size);
15665 if (copy == NULL) {
15666 PyErr_NoMemory();
15667 return NULL;
15668 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015669 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015670 return copy;
15671}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015672
Georg Brandl66c221e2010-10-14 07:04:07 +000015673/* A _string module, to export formatter_parser and formatter_field_name_split
15674 to the string.Formatter class implemented in Python. */
15675
15676static PyMethodDef _string_methods[] = {
15677 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15678 METH_O, PyDoc_STR("split the argument as a field name")},
15679 {"formatter_parser", (PyCFunction) formatter_parser,
15680 METH_O, PyDoc_STR("parse the argument as a format string")},
15681 {NULL, NULL}
15682};
15683
15684static struct PyModuleDef _string_module = {
15685 PyModuleDef_HEAD_INIT,
15686 "_string",
15687 PyDoc_STR("string helper module"),
15688 0,
15689 _string_methods,
15690 NULL,
15691 NULL,
15692 NULL,
15693 NULL
15694};
15695
15696PyMODINIT_FUNC
15697PyInit__string(void)
15698{
15699 return PyModule_Create(&_string_module);
15700}
15701
15702
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015703#ifdef __cplusplus
15704}
15705#endif