blob: 1a696cc5c89ead6e078a8511d91d49bd877e0d78 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090052class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
55
56/*[python input]
57class Py_UCS4_converter(CConverter):
58 type = 'Py_UCS4'
59 converter = 'convert_uc'
60
61 def converter_init(self):
62 if self.default is not unspecified:
63 self.c_default = ascii(self.default)
64 if len(self.c_default) > 4 or self.c_default[0] != "'":
65 self.c_default = hex(ord(self.default))
66
67[python start generated code]*/
68/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080069
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000070/* --- Globals ------------------------------------------------------------
71
Serhiy Storchaka05997252013-01-26 12:14:02 +020072NOTE: In the interpreter's initialization phase, some globals are currently
73 initialized dynamically as needed. In the process Unicode objects may
74 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000075
76*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000078
79#ifdef __cplusplus
80extern "C" {
81#endif
82
Victor Stinner8faf8212011-12-08 22:14:11 +010083/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
84#define MAX_UNICODE 0x10ffff
85
/* Type check used by the assertions in the macros below: the full
   consistency check in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for compact ASCII objects this is the memory that
   immediately follows the PyASCIIObject header, otherwise the utf8 field.
   Asserts that op is a ready Unicode object. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for compact ASCII objects this is the character
   length, otherwise the utf8_length field.  Asserts that op is a ready
   Unicode object. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked accessors for individual object fields.  They expand to the
   field itself (no assertion, no type check). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Same as reading state.kind / length directly, but asserting first that
   op passes _PyUnicode_CHECK. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready(op).
   The type check is only an assertion here. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200133
/* True if the utf8 pointer of op aliases its canonical character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op aliases its canonical character data.
   The macro only refers to its own argument; it must not depend on a
   variable named "unicode" being in scope at the point of use. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
141
/* True if the Unicode object has an allocated UTF-8 memory block that is
   not shared with other data: the object is not compact ASCII, its utf8
   pointer is set, and that pointer does not alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block that is
   not shared with other data: wstr is set and either the object is not
   ready yet, or wstr does not alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
155
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Elements are copied four at a time, then a scalar loop handles the
   remaining 0..3 elements.  The source pointers are cast to
   "const from_type *" so that const-qualified inputs keep their qualifier,
   and every local carries a leading underscore so that it cannot shadow a
   caller-side variable such as "n". */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200179
/* Overallocation factor; per the notes below, 2 corresponds to +50% and
   4 to +25%.  NOTE(review): presumably applied as size / OVERALLOCATE_FACTOR
   at the use sites -- confirm there. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
187
Walter Dörwald16807132007-05-25 13:52:07 +0000188/* This dictionary holds all interned unicode strings. Note that references
189 to strings in this dictionary are *not* counted in the string's ob_refcnt.
190 When the interned string reaches a refcnt of 0 the string deallocation
191 function will delete the reference from this dictionary.
192
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000194 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000195*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200196static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000197
/* The empty Unicode object is shared to improve performance.
   It is created lazily: the pointer stays NULL until the first use of
   _Py_INCREF_UNICODE_EMPTY(). */
static PyObject *unicode_empty = NULL;

/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails, unicode_empty
   is left NULL and no reference is taken, so callers must be prepared for
   unicode_empty == NULL afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned pointer is NULL when the empty string could not
   be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200220/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700221static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200222_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200224/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000227/* Single character Unicode strings in the Latin-1 range are being
228 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200229static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
/* Lookup table for fast detection of the most frequent whitespace
   characters.  It is indexed by an ASCII code point (0..127); an entry is
   non-zero when that character is whitespace.  Entries that are not listed
   are zero. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
261
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200262/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200263static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200264static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100265static int unicode_modifiable(PyObject *unicode);
266
Victor Stinnerfe226c02011-10-03 03:52:20 +0200267
Alexander Belopolsky40018472011-02-26 01:02:56 +0000268static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100269_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200270static PyObject *
271_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272static PyObject *
273_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274
275static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000276unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000277 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100278 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000279 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280
Alexander Belopolsky40018472011-02-26 01:02:56 +0000281static void
282raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300283 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100284 PyObject *unicode,
285 Py_ssize_t startpos, Py_ssize_t endpos,
286 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000287
/* Lookup table for fast detection of ASCII line breaks.  It is indexed by
   an ASCII code point (0..127); an entry is non-zero when that character
   is a line break.  Entries that are not listed are zero. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
315
INADA Naoki3ae20562017-01-16 20:41:20 +0900316static int convert_uc(PyObject *obj, void *addr);
317
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300318#include "clinic/unicodeobject.c.h"
319
/* Codec error handlers known to the fast paths of this file.
   _Py_ERROR_OTHER stands for any handler name that is not listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler constant.

   errors may be NULL, which means "strict".  The comparison is exact
   (case-sensitive); any unrecognized name yields _Py_ERROR_OTHER. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
358
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300359/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000362PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000363{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000364#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000365 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000366#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000367 /* This is actually an illegal character, so it should
368 not be passed to unichr. */
369 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000370#endif
371}
372
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal invariants of a Unicode object.

   op must be a Unicode object.  Every invariant is verified with assert(),
   so a violation aborts; when all checks pass the function returns 1 (which
   lets callers write assert(_PyUnicode_CheckConsistency(...))).

   If check_content is non-zero and the string is not in the wchar-only
   (not ready) form, the characters are also scanned to verify that the
   narrowest possible kind is used and that the data is followed by a
   zero character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: always 1 byte per character and always ready. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact, non-ASCII: the characters directly follow the
               PyCompactUnicodeObject header and never double as the
               UTF-8 buffer. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Non-compact object: the characters live behind data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer exists. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready, non-compact: an ASCII string shares its data as
                   the UTF-8 buffer, any other string must not. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* wstr aliases the data exactly when the kind has the same width
           as wchar_t; in every other case it must be a different pointer. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cache buffer must have a zero cached length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* Find the largest code point actually stored. */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall in the range that requires
           exactly this kind (and, for 1-byte strings, match the ascii
           flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The character data is followed by a zero character. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
488
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489static PyObject*
490unicode_result_wchar(PyObject *unicode)
491{
492#ifndef Py_DEBUG
493 Py_ssize_t len;
494
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100495 len = _PyUnicode_WSTR_LENGTH(unicode);
496 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100497 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 }
500
501 if (len == 1) {
502 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100503 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100504 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505 Py_DECREF(unicode);
506 return latin1_char;
507 }
508 }
509
510 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200511 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 return NULL;
513 }
514#else
Victor Stinneraa771272012-10-04 02:32:58 +0200515 assert(Py_REFCNT(unicode) == 1);
516
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100517 /* don't make the result ready in debug mode to ensure that the caller
518 makes the string ready before using it */
519 assert(_PyUnicode_CheckConsistency(unicode, 1));
520#endif
521 return unicode;
522}
523
524static PyObject*
525unicode_result_ready(PyObject *unicode)
526{
527 Py_ssize_t length;
528
529 length = PyUnicode_GET_LENGTH(unicode);
530 if (length == 0) {
531 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100532 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200533 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 }
535 return unicode_empty;
536 }
537
538 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200539 void *data = PyUnicode_DATA(unicode);
540 int kind = PyUnicode_KIND(unicode);
541 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 if (ch < 256) {
543 PyObject *latin1_char = unicode_latin1[ch];
544 if (latin1_char != NULL) {
545 if (unicode != latin1_char) {
546 Py_INCREF(latin1_char);
547 Py_DECREF(unicode);
548 }
549 return latin1_char;
550 }
551 else {
552 assert(_PyUnicode_CheckConsistency(unicode, 1));
553 Py_INCREF(unicode);
554 unicode_latin1[ch] = unicode;
555 return unicode;
556 }
557 }
558 }
559
560 assert(_PyUnicode_CheckConsistency(unicode, 1));
561 return unicode;
562}
563
564static PyObject*
565unicode_result(PyObject *unicode)
566{
567 assert(_PyUnicode_CHECK(unicode));
568 if (PyUnicode_IS_READY(unicode))
569 return unicode_result_ready(unicode);
570 else
571 return unicode_result_wchar(unicode);
572}
573
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574static PyObject*
575unicode_result_unchanged(PyObject *unicode)
576{
577 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500578 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100579 return NULL;
580 Py_INCREF(unicode);
581 return unicode;
582 }
583 else
584 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100585 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100586}
587
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200588/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589 ASCII, Latin1, UTF-8, etc. */
590static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200591backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200592 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593{
Victor Stinnerad771582015-10-09 12:38:53 +0200594 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200595 Py_UCS4 ch;
596 enum PyUnicode_Kind kind;
597 void *data;
598
599 assert(PyUnicode_IS_READY(unicode));
600 kind = PyUnicode_KIND(unicode);
601 data = PyUnicode_DATA(unicode);
602
603 size = 0;
604 /* determine replacement size */
605 for (i = collstart; i < collend; ++i) {
606 Py_ssize_t incr;
607
608 ch = PyUnicode_READ(kind, data, i);
609 if (ch < 0x100)
610 incr = 2+2;
611 else if (ch < 0x10000)
612 incr = 2+4;
613 else {
614 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200615 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200616 }
617 if (size > PY_SSIZE_T_MAX - incr) {
618 PyErr_SetString(PyExc_OverflowError,
619 "encoded result is too long for a Python string");
620 return NULL;
621 }
622 size += incr;
623 }
624
Victor Stinnerad771582015-10-09 12:38:53 +0200625 str = _PyBytesWriter_Prepare(writer, str, size);
626 if (str == NULL)
627 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628
629 /* generate replacement */
630 for (i = collstart; i < collend; ++i) {
631 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200632 *str++ = '\\';
633 if (ch >= 0x00010000) {
634 *str++ = 'U';
635 *str++ = Py_hexdigits[(ch>>28)&0xf];
636 *str++ = Py_hexdigits[(ch>>24)&0xf];
637 *str++ = Py_hexdigits[(ch>>20)&0xf];
638 *str++ = Py_hexdigits[(ch>>16)&0xf];
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200641 }
Victor Stinner797485e2015-10-09 03:17:30 +0200642 else if (ch >= 0x100) {
643 *str++ = 'u';
644 *str++ = Py_hexdigits[(ch>>12)&0xf];
645 *str++ = Py_hexdigits[(ch>>8)&0xf];
646 }
647 else
648 *str++ = 'x';
649 *str++ = Py_hexdigits[(ch>>4)&0xf];
650 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 }
652 return str;
653}
654
655/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656 ASCII, Latin1, UTF-8, etc. */
657static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200658xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200659 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660{
Victor Stinnerad771582015-10-09 12:38:53 +0200661 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 Py_UCS4 ch;
663 enum PyUnicode_Kind kind;
664 void *data;
665
666 assert(PyUnicode_IS_READY(unicode));
667 kind = PyUnicode_KIND(unicode);
668 data = PyUnicode_DATA(unicode);
669
670 size = 0;
671 /* determine replacement size */
672 for (i = collstart; i < collend; ++i) {
673 Py_ssize_t incr;
674
675 ch = PyUnicode_READ(kind, data, i);
676 if (ch < 10)
677 incr = 2+1+1;
678 else if (ch < 100)
679 incr = 2+2+1;
680 else if (ch < 1000)
681 incr = 2+3+1;
682 else if (ch < 10000)
683 incr = 2+4+1;
684 else if (ch < 100000)
685 incr = 2+5+1;
686 else if (ch < 1000000)
687 incr = 2+6+1;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+7+1;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
Victor Stinnerad771582015-10-09 12:38:53 +0200700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707 }
708 return str;
709}
710
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711/* --- Bloom Filters ----------------------------------------------------- */
712
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
716
717/* the linebreak mask is set up by Unicode_Init below */
718
Antoine Pitrouf068f942010-01-13 14:19:12 +0000719#if LONG_BIT >= 128
720#define BLOOM_WIDTH 128
721#elif LONG_BIT >= 64
722#define BLOOM_WIDTH 64
723#elif LONG_BIT >= 32
724#define BLOOM_WIDTH 32
725#else
726#error "LONG_BIT is smaller than 32"
727#endif
728
Thomas Wouters477c8d52006-05-27 19:21:47 +0000729#define BLOOM_MASK unsigned long
730
Serhiy Storchaka05997252013-01-26 12:14:02 +0200731static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
Benjamin Peterson29060642009-01-31 22:14:21 +0000735#define BLOOM_LINEBREAK(ch) \
736 ((ch) < 128U ? ascii_linebreak[(ch)] : \
737 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700739static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741{
Victor Stinnera85af502013-04-09 21:53:54 +0200742#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743 do { \
744 TYPE *data = (TYPE *)PTR; \
745 TYPE *end = data + LEN; \
746 Py_UCS4 ch; \
747 for (; data != end; data++) { \
748 ch = *data; \
749 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750 } \
751 break; \
752 } while (0)
753
Thomas Wouters477c8d52006-05-27 19:21:47 +0000754 /* calculate simple bloom-style bitmask for a given unicode string */
755
Antoine Pitrouf068f942010-01-13 14:19:12 +0000756 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000757
758 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200759 switch (kind) {
760 case PyUnicode_1BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762 break;
763 case PyUnicode_2BYTE_KIND:
764 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765 break;
766 case PyUnicode_4BYTE_KIND:
767 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768 break;
769 default:
770 assert(0);
771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200773
774#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775}
776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300777static int
778ensure_unicode(PyObject *obj)
779{
780 if (!PyUnicode_Check(obj)) {
781 PyErr_Format(PyExc_TypeError,
782 "must be str, not %.100s",
783 Py_TYPE(obj)->tp_name);
784 return -1;
785 }
786 return PyUnicode_READY(obj);
787}
788
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200789/* Compilation of templated routines */
790
791#include "stringlib/asciilib.h"
792#include "stringlib/fastsearch.h"
793#include "stringlib/partition.h"
794#include "stringlib/split.h"
795#include "stringlib/count.h"
796#include "stringlib/find.h"
797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs1lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300807#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs2lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300818#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
823#include "stringlib/ucs4lib.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/partition.h"
826#include "stringlib/split.h"
827#include "stringlib/count.h"
828#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300829#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200830#include "stringlib/find_max_char.h"
831#include "stringlib/localeutil.h"
832#include "stringlib/undef.h"
833
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834#include "stringlib/unicodedefs.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/count.h"
837#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100838#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200839
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840/* --- Unicode Object ----------------------------------------------------- */
841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200843fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700845static inline Py_ssize_t
846findchar(const void *s, int kind,
847 Py_ssize_t size, Py_UCS4 ch,
848 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 switch (kind) {
851 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200852 if ((Py_UCS1) ch != ch)
853 return -1;
854 if (direction > 0)
855 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856 else
857 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200858 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200859 if ((Py_UCS2) ch != ch)
860 return -1;
861 if (direction > 0)
862 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863 else
864 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200866 if (direction > 0)
867 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868 else
869 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200870 default:
871 assert(0);
872 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
#ifdef Py_DEBUG
/* Overwrite the characters added to a Unicode string (those from index
   old_length up to its current length) with 0xff bytes, so that reading
   them before they are initialized is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    /* nothing to fill unless the string grew */
    if (new_length > old_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
894
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895static PyObject*
896resize_compact(PyObject *unicode, Py_ssize_t length)
897{
898 Py_ssize_t char_size;
899 Py_ssize_t struct_size;
900 Py_ssize_t new_size;
901 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100902 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200903#ifdef Py_DEBUG
904 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905#endif
906
Victor Stinner79891572012-05-03 13:43:07 +0200907 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100909 assert(PyUnicode_IS_COMPACT(unicode));
910
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200911 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913 struct_size = sizeof(PyASCIIObject);
914 else
915 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200916 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919 PyErr_NoMemory();
920 return NULL;
921 }
922 new_size = (struct_size + (length + 1) * char_size);
923
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925 PyObject_DEL(_PyUnicode_UTF8(unicode));
926 _PyUnicode_UTF8(unicode) = NULL;
927 _PyUnicode_UTF8_LENGTH(unicode) = 0;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 _Py_DEC_REFTOTAL;
930 _Py_ForgetReference(unicode);
931
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300932 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100933 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100934 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 PyErr_NoMemory();
936 return NULL;
937 }
Victor Stinner84def372011-12-11 20:04:56 +0100938 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100944 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200945 _PyUnicode_WSTR_LENGTH(unicode) = length;
946 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100947 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_WSTR(unicode));
949 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100950 if (!PyUnicode_IS_ASCII(unicode))
951 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100952 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200953#ifdef Py_DEBUG
954 unicode_fill_invalid(unicode, old_length);
955#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200958 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 return unicode;
960}
961
Alexander Belopolsky40018472011-02-26 01:02:56 +0000962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200963resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964{
Victor Stinner95663112011-10-04 01:03:50 +0200965 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100966 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000969
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 if (PyUnicode_IS_READY(unicode)) {
971 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200972 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200974#ifdef Py_DEBUG
975 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
978 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200979 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982
983 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984 PyErr_NoMemory();
985 return -1;
986 }
987 new_size = (length + 1) * char_size;
988
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990 {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 data = (PyObject *)PyObject_REALLOC(data, new_size);
997 if (data == NULL) {
998 PyErr_NoMemory();
999 return -1;
1000 }
1001 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 _PyUnicode_WSTR_LENGTH(unicode) = length;
1005 }
1006 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001007 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 _PyUnicode_UTF8_LENGTH(unicode) = length;
1009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 _PyUnicode_LENGTH(unicode) = length;
1011 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001012#ifdef Py_DEBUG
1013 unicode_fill_invalid(unicode, old_length);
1014#endif
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001016 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinner95663112011-10-04 01:03:50 +02001020 assert(_PyUnicode_WSTR(unicode) != NULL);
1021
1022 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001023 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001024 PyErr_NoMemory();
1025 return -1;
1026 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001028 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001030 if (!wstr) {
1031 PyErr_NoMemory();
1032 return -1;
1033 }
1034 _PyUnicode_WSTR(unicode) = wstr;
1035 _PyUnicode_WSTR(unicode)[length] = 0;
1036 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001037 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return 0;
1039}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001073 Ux0000 terminated; some code (e.g. new_identifier)
1074 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1186
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t characters of [begin, end) to UCS4 and store
   them in the data of unicode, which must be a 4-byte kind string.  A high
   surrogate immediately followed by a low surrogate is joined into a single
   code point; every other unit is copied as is.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* surrogate pair: two input units give one code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
1522 assert(0);
1523 return -1;
1524 }
1525 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001526 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001527 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001528 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001529 Py_ssize_t i;
1530
Victor Stinnera0702ab2011-09-29 14:14:38 +02001531 for (i=0; i < how_many; i++) {
1532 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (ch > to_maxchar)
1534 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001535 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 }
1538 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 return 0;
1540}
1541
Victor Stinnerd3f08822012-05-29 12:57:52 +02001542void
1543_PyUnicode_FastCopyCharacters(
1544 PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546{
1547 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548}
1549
1550Py_ssize_t
1551PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many)
1554{
1555 int err;
1556
1557 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561
Benjamin Petersonbac79492012-01-14 13:34:47 -05001562 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001564 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001567 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001568 PyErr_SetString(PyExc_IndexError, "string index out of range");
1569 return -1;
1570 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001571 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001572 PyErr_SetString(PyExc_IndexError, "string index out of range");
1573 return -1;
1574 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001575 if (how_many < 0) {
1576 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577 return -1;
1578 }
1579 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001582 "Cannot write %zi characters at %zi "
1583 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001584 how_many, to_start, PyUnicode_GET_LENGTH(to));
1585 return -1;
1586 }
1587
1588 if (how_many == 0)
1589 return 0;
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 return -1;
1593
1594 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595 if (err) {
1596 PyErr_Format(PyExc_SystemError,
1597 "Cannot copy %s characters "
1598 "into a string of %s characters",
1599 unicode_kind_name(from),
1600 unicode_kind_name(to));
1601 return -1;
1602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604}
1605
Victor Stinner17222162011-09-28 22:15:37 +02001606/* Find the maximum code point and count the number of surrogate pairs so a
1607 correct string length can be computed before converting a string to UCS4.
1608 This function counts single surrogates as a character and not as a pair.
1609
1610 Return 0 on success, or -1 on error. */
1611static int
1612find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614{
1615 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001616 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617
Victor Stinnerc53be962011-10-02 21:33:54 +02001618 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 *num_surrogates = 0;
1620 *maxchar = 0;
1621
1622 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001624 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625 && (iter+1) < end
1626 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627 {
1628 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629 ++(*num_surrogates);
1630 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001634 {
1635 ch = *iter;
1636 iter++;
1637 }
1638 if (ch > *maxchar) {
1639 *maxchar = ch;
1640 if (*maxchar > MAX_UNICODE) {
1641 PyErr_Format(PyExc_ValueError,
1642 "character U+%x is not in range [U+0000; U+10ffff]",
1643 ch);
1644 return -1;
1645 }
1646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 }
1648 return 0;
1649}
1650
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001651int
1652_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653{
1654 wchar_t *end;
1655 Py_UCS4 maxchar = 0;
1656 Py_ssize_t num_surrogates;
1657#if SIZEOF_WCHAR_T == 2
1658 Py_ssize_t length_wo_surrogates;
1659#endif
1660
Georg Brandl7597add2011-10-05 16:36:47 +02001661 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001662 strings were created using _PyObject_New() and where no canonical
1663 representation (the str field) has been set yet aka strings
1664 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001665 assert(_PyUnicode_CHECK(unicode));
1666 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001668 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001669 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001670 /* Actually, it should neither be interned nor be anything else: */
1671 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001674 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677
1678 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001679 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyErr_NoMemory();
1682 return -1;
1683 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 _PyUnicode_WSTR(unicode), end,
1686 PyUnicode_1BYTE_DATA(unicode));
1687 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001691 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 }
1695 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001696 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
1700 PyObject_FREE(_PyUnicode_WSTR(unicode));
1701 _PyUnicode_WSTR(unicode) = NULL;
1702 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703 }
1704 /* In this case we might have to convert down from 4-byte native
1705 wchar_t to 2-byte unicode. */
1706 else if (maxchar < 65536) {
1707 assert(num_surrogates == 0 &&
1708 "FindMaxCharAndNumSurrogatePairs() messed up");
1709
Victor Stinner506f5922011-09-28 22:34:18 +02001710#if SIZEOF_WCHAR_T == 2
1711 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718#else
1719 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001720 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001721 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001723 PyErr_NoMemory();
1724 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 }
Victor Stinner506f5922011-09-28 22:34:18 +02001726 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727 _PyUnicode_WSTR(unicode), end,
1728 PyUnicode_2BYTE_DATA(unicode));
1729 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001732 _PyUnicode_UTF8(unicode) = NULL;
1733 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 }
1739 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740 else {
1741#if SIZEOF_WCHAR_T == 2
1742 /* in case the native representation is 2-bytes, we need to allocate a
1743 new normalized 4-byte version. */
1744 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001745 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746 PyErr_NoMemory();
1747 return -1;
1748 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001749 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 PyErr_NoMemory();
1752 return -1;
1753 }
1754 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001758 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 PyObject_FREE(_PyUnicode_WSTR(unicode));
1762 _PyUnicode_WSTR(unicode) = NULL;
1763 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764#else
1765 assert(num_surrogates == 0);
1766
Victor Stinnerc3c74152011-10-02 20:39:55 +02001767 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001769 _PyUnicode_UTF8(unicode) = NULL;
1770 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772#endif
1773 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774 }
1775 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001776 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 return 0;
1778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001781unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald16807132007-05-25 13:52:07 +00001783 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 case SSTATE_NOT_INTERNED:
1785 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001786
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 case SSTATE_INTERNED_MORTAL:
1788 /* revive dead object temporarily for DelItem */
1789 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001790 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 Py_FatalError(
1792 "deletion of interned string failed");
1793 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001794
Benjamin Peterson29060642009-01-31 22:14:21 +00001795 case SSTATE_INTERNED_IMMORTAL:
1796 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001797
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 default:
1799 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001800 }
1801
Victor Stinner03490912011-10-03 23:45:12 +02001802 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001804 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001806 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001809 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons,
   i.e. the empty string or a one-character string stored in the
   unicode_latin1 table; return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* only a one-character string with canonical data can be a cached
       Latin-1 character */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return unicode_latin1[ch] == unicode ? 1 : 0;
}
#endif
1828
Alexander Belopolsky40018472011-02-26 01:02:56 +00001829static int
Victor Stinner488fa492011-12-12 00:01:39 +01001830unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831{
Victor Stinner488fa492011-12-12 00:01:39 +01001832 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 if (Py_REFCNT(unicode) != 1)
1834 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001835 if (_PyUnicode_HASH(unicode) != -1)
1836 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837 if (PyUnicode_CHECK_INTERNED(unicode))
1838 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001839 if (!PyUnicode_CheckExact(unicode))
1840 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001841#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001842 /* singleton refcount is greater than 1 */
1843 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001844#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001845 return 1;
1846}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848static int
1849unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850{
1851 PyObject *unicode;
1852 Py_ssize_t old_length;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856
1857 assert(unicode != NULL);
1858 assert(PyUnicode_Check(unicode));
1859 assert(0 <= length);
1860
Victor Stinner910337b2011-10-03 03:20:16 +02001861 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 old_length = PyUnicode_WSTR_LENGTH(unicode);
1863 else
1864 old_length = PyUnicode_GET_LENGTH(unicode);
1865 if (old_length == length)
1866 return 0;
1867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001869 _Py_INCREF_UNICODE_EMPTY();
1870 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001872 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001873 return 0;
1874 }
1875
Victor Stinner488fa492011-12-12 00:01:39 +01001876 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *copy = resize_copy(unicode, length);
1878 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001880 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882 }
1883
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001885 PyObject *new_unicode = resize_compact(unicode, length);
1886 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001888 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001891 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001892}
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001896{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897 PyObject *unicode;
1898 if (p_unicode == NULL) {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001903 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001904 {
1905 PyErr_BadInternalCall();
1906 return -1;
1907 }
1908 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001911/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001913 WARNING: The function doesn't copy the terminating null character and
1914 doesn't check the maximum character (may write a latin1 character in an
1915 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001916static void
1917unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001919{
1920 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923
1924 switch (kind) {
1925 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001927#ifdef Py_DEBUG
1928 if (PyUnicode_IS_ASCII(unicode)) {
1929 Py_UCS4 maxchar = ucs1lib_find_max_char(
1930 (const Py_UCS1*)str,
1931 (const Py_UCS1*)str + len);
1932 assert(maxchar < 128);
1933 }
1934#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001935 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 case PyUnicode_2BYTE_KIND: {
1939 Py_UCS2 *start = (Py_UCS2 *)data + index;
1940 Py_UCS2 *ucs2 = start;
1941 assert(index <= PyUnicode_GET_LENGTH(unicode));
1942
Victor Stinner184252a2012-06-16 02:57:41 +02001943 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 *ucs2 = (Py_UCS2)*str;
1945
1946 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001947 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 default: {
1950 Py_UCS4 *start = (Py_UCS4 *)data + index;
1951 Py_UCS4 *ucs4 = start;
1952 assert(kind == PyUnicode_4BYTE_KIND);
1953 assert(index <= PyUnicode_GET_LENGTH(unicode));
1954
Victor Stinner184252a2012-06-16 02:57:41 +02001955 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001956 *ucs4 = (Py_UCS4)*str;
1957
1958 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001959 }
1960 }
1961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963static PyObject*
1964get_latin1_char(unsigned char ch)
1965{
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode)
1970 return NULL;
1971 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 unicode_latin1[ch] = unicode;
1974 }
1975 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001976 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977}
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979static PyObject*
1980unicode_char(Py_UCS4 ch)
1981{
1982 PyObject *unicode;
1983
1984 assert(ch <= MAX_UNICODE);
1985
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001986 if (ch < 256)
1987 return get_latin1_char(ch);
1988
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 unicode = PyUnicode_New(1, ch);
1990 if (unicode == NULL)
1991 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001992
1993 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001996 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999 }
2000 assert(_PyUnicode_CheckConsistency(unicode, 1));
2001 return unicode;
2002}
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004PyObject *
2005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002007 if (u == NULL)
2008 return (PyObject*)_PyUnicode_New(size);
2009
2010 if (size < 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 return PyUnicode_FromWideChar(u, size);
2016}
2017
2018PyObject *
2019PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 Py_UCS4 maxchar = 0;
2023 Py_ssize_t num_surrogates;
2024
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL && size != 0) {
2026 PyErr_BadInternalCall();
2027 return NULL;
2028 }
2029
2030 if (size == -1) {
2031 size = wcslen(u);
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 /* If the Unicode data is known at construction time, we can apply
2035 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002038 if (size == 0)
2039 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 /* Single character Unicode objects in the Latin-1 range are
2042 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002043 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return get_latin1_char((unsigned char)*u);
2045
2046 /* If not empty and not single character, copy the Unicode data
2047 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002048 if (find_maxchar_surrogates(u, u + size,
2049 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 return NULL;
2051
Victor Stinner8faf8212011-12-08 22:14:11 +01002052 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!unicode)
2054 return NULL;
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 switch (PyUnicode_KIND(unicode)) {
2057 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060 break;
2061 case PyUnicode_2BYTE_KIND:
2062#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002063 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002065 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067#endif
2068 break;
2069 case PyUnicode_4BYTE_KIND:
2070#if SIZEOF_WCHAR_T == 2
2071 /* This is the only case which has to process surrogates, thus
2072 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002073 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#else
2075 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002076 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077#endif
2078 break;
2079 default:
2080 assert(0 && "Impossible state");
2081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002083 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (size < 0) {
2090 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 return NULL;
2093 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002094 if (u != NULL)
2095 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096 else
2097 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002098}
2099
Alexander Belopolsky40018472011-02-26 01:02:56 +00002100PyObject *
2101PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102{
2103 size_t size = strlen(u);
2104 if (size > PY_SSIZE_T_MAX) {
2105 PyErr_SetString(PyExc_OverflowError, "input too long");
2106 return NULL;
2107 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002108 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002109}
2110
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002111PyObject *
2112_PyUnicode_FromId(_Py_Identifier *id)
2113{
2114 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002115 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116 strlen(id->string),
2117 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002118 if (!id->object)
2119 return NULL;
2120 PyUnicode_InternInPlace(&id->object);
2121 assert(!id->next);
2122 id->next = static_strings;
2123 static_strings = id;
2124 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002125 return id->object;
2126}
2127
2128void
2129_PyUnicode_ClearStaticStrings()
2130{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002131 _Py_Identifier *tmp, *s = static_strings;
2132 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002133 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002134 tmp = s->next;
2135 s->next = NULL;
2136 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002138 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139}
2140
Benjamin Peterson0df54292012-03-26 14:50:32 -04002141/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Victor Stinnerd3f08822012-05-29 12:57:52 +02002143PyObject*
2144_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002145{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002146 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002148 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002149#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002152 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002153 }
Victor Stinner785938e2011-12-11 20:09:03 +01002154 unicode = PyUnicode_New(size, 127);
2155 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002156 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002157 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002160}
2161
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002162static Py_UCS4
2163kind_maxchar_limit(unsigned int kind)
2164{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002165 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002166 case PyUnicode_1BYTE_KIND:
2167 return 0x80;
2168 case PyUnicode_2BYTE_KIND:
2169 return 0x100;
2170 case PyUnicode_4BYTE_KIND:
2171 return 0x10000;
2172 default:
2173 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002175 }
2176}
2177
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002178static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002179align_maxchar(Py_UCS4 maxchar)
2180{
2181 if (maxchar <= 127)
2182 return 127;
2183 else if (maxchar <= 255)
2184 return 255;
2185 else if (maxchar <= 65535)
2186 return 65535;
2187 else
2188 return MAX_UNICODE;
2189}
2190
Victor Stinner702c7342011-10-05 13:50:52 +02002191static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002192_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002195 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196
Serhiy Storchaka678db842013-01-26 12:16:36 +02002197 if (size == 0)
2198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002199 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002200 if (size == 1)
2201 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002202
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002203 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002204 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 if (!res)
2206 return NULL;
2207 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002208 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002210}
2211
Victor Stinnere57b1c02011-09-28 22:20:48 +02002212static PyObject*
2213_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214{
2215 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002216 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002217
Serhiy Storchaka678db842013-01-26 12:16:36 +02002218 if (size == 0)
2219 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002220 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002221 if (size == 1)
2222 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002223
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002224 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002225 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 if (!res)
2227 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002230 else {
2231 _PyUnicode_CONVERT_BYTES(
2232 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002234 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 return res;
2236}
2237
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238static PyObject*
2239_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240{
2241 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002242 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243
Serhiy Storchaka678db842013-01-26 12:16:36 +02002244 if (size == 0)
2245 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002247 if (size == 1)
2248 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002250 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 if (!res)
2253 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002254 if (max_char < 256)
2255 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256 PyUnicode_1BYTE_DATA(res));
2257 else if (max_char < 0x10000)
2258 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259 PyUnicode_2BYTE_DATA(res));
2260 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002262 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 return res;
2264}
2265
2266PyObject*
2267PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002269 if (size < 0) {
2270 PyErr_SetString(PyExc_ValueError, "size must be positive");
2271 return NULL;
2272 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002273 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002275 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002277 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002281 PyErr_SetString(PyExc_SystemError, "invalid kind");
2282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284}
2285
Victor Stinnerece58de2012-04-23 23:36:38 +02002286Py_UCS4
2287_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288{
2289 enum PyUnicode_Kind kind;
2290 void *startptr, *endptr;
2291
2292 assert(PyUnicode_IS_READY(unicode));
2293 assert(0 <= start);
2294 assert(end <= PyUnicode_GET_LENGTH(unicode));
2295 assert(start <= end);
2296
2297 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2298 return PyUnicode_MAX_CHAR_VALUE(unicode);
2299
2300 if (start == end)
2301 return 127;
2302
Victor Stinner94d558b2012-04-27 22:26:58 +02002303 if (PyUnicode_IS_ASCII(unicode))
2304 return 127;
2305
Victor Stinnerece58de2012-04-23 23:36:38 +02002306 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002307 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002308 endptr = (char *)startptr + end * kind;
2309 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002310 switch(kind) {
2311 case PyUnicode_1BYTE_KIND:
2312 return ucs1lib_find_max_char(startptr, endptr);
2313 case PyUnicode_2BYTE_KIND:
2314 return ucs2lib_find_max_char(startptr, endptr);
2315 case PyUnicode_4BYTE_KIND:
2316 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002318 assert(0);
2319 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002320 }
2321}
2322
Victor Stinner25a4b292011-10-06 12:31:55 +02002323/* Ensure that a string uses the most efficient storage, if it is not the
2324 case: create a new string with of the right kind. Write NULL into *p_unicode
2325 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002326static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002327unicode_adjust_maxchar(PyObject **p_unicode)
2328{
2329 PyObject *unicode, *copy;
2330 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002331 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 unsigned int kind;
2333
2334 assert(p_unicode != NULL);
2335 unicode = *p_unicode;
2336 assert(PyUnicode_IS_READY(unicode));
2337 if (PyUnicode_IS_ASCII(unicode))
2338 return;
2339
2340 len = PyUnicode_GET_LENGTH(unicode);
2341 kind = PyUnicode_KIND(unicode);
2342 if (kind == PyUnicode_1BYTE_KIND) {
2343 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002344 max_char = ucs1lib_find_max_char(u, u + len);
2345 if (max_char >= 128)
2346 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002347 }
2348 else if (kind == PyUnicode_2BYTE_KIND) {
2349 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002350 max_char = ucs2lib_find_max_char(u, u + len);
2351 if (max_char >= 256)
2352 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002353 }
2354 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002356 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 max_char = ucs4lib_find_max_char(u, u + len);
2358 if (max_char >= 0x10000)
2359 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002360 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002361 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002362 if (copy != NULL)
2363 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 Py_DECREF(unicode);
2365 *p_unicode = copy;
2366}
2367
Victor Stinner034f6cf2011-09-30 02:26:44 +02002368PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002369_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370{
Victor Stinner87af4f22011-11-21 23:03:47 +01002371 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002372 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002373
Victor Stinner034f6cf2011-09-30 02:26:44 +02002374 if (!PyUnicode_Check(unicode)) {
2375 PyErr_BadInternalCall();
2376 return NULL;
2377 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002378 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002380
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 length = PyUnicode_GET_LENGTH(unicode);
2382 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 if (!copy)
2384 return NULL;
2385 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2386
Christian Heimesf051e432016-09-13 20:22:02 +02002387 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002388 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002389 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002391}
2392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393
Victor Stinnerbc603d12011-10-02 01:00:40 +02002394/* Widen Unicode objects to larger buffers. Don't write terminating null
2395 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396
2397void*
2398_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2399{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 Py_ssize_t len;
2401 void *result;
2402 unsigned int skind;
2403
Benjamin Petersonbac79492012-01-14 13:34:47 -05002404 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405 return NULL;
2406
2407 len = PyUnicode_GET_LENGTH(s);
2408 skind = PyUnicode_KIND(s);
2409 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002410 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 return NULL;
2412 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002413 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002415 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 if (!result)
2417 return PyErr_NoMemory();
2418 assert(skind == PyUnicode_1BYTE_KIND);
2419 _PyUnicode_CONVERT_BYTES(
2420 Py_UCS1, Py_UCS2,
2421 PyUnicode_1BYTE_DATA(s),
2422 PyUnicode_1BYTE_DATA(s) + len,
2423 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 if (skind == PyUnicode_2BYTE_KIND) {
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS2, Py_UCS4,
2432 PyUnicode_2BYTE_DATA(s),
2433 PyUnicode_2BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 else {
2437 assert(skind == PyUnicode_1BYTE_KIND);
2438 _PyUnicode_CONVERT_BYTES(
2439 Py_UCS1, Py_UCS4,
2440 PyUnicode_1BYTE_DATA(s),
2441 PyUnicode_1BYTE_DATA(s) + len,
2442 result);
2443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002445 default:
2446 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 }
Victor Stinner01698042011-10-04 00:04:26 +02002448 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 return NULL;
2450}
2451
2452static Py_UCS4*
2453as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2454 int copy_null)
2455{
2456 int kind;
2457 void *data;
2458 Py_ssize_t len, targetlen;
2459 if (PyUnicode_READY(string) == -1)
2460 return NULL;
2461 kind = PyUnicode_KIND(string);
2462 data = PyUnicode_DATA(string);
2463 len = PyUnicode_GET_LENGTH(string);
2464 targetlen = len;
2465 if (copy_null)
2466 targetlen++;
2467 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002468 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!target) {
2470 PyErr_NoMemory();
2471 return NULL;
2472 }
2473 }
2474 else {
2475 if (targetsize < targetlen) {
2476 PyErr_Format(PyExc_SystemError,
2477 "string is longer than the buffer");
2478 if (copy_null && 0 < targetsize)
2479 target[0] = 0;
2480 return NULL;
2481 }
2482 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002483 if (kind == PyUnicode_1BYTE_KIND) {
2484 Py_UCS1 *start = (Py_UCS1 *) data;
2485 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002487 else if (kind == PyUnicode_2BYTE_KIND) {
2488 Py_UCS2 *start = (Py_UCS2 *) data;
2489 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2490 }
2491 else {
2492 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002493 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 if (copy_null)
2496 target[len] = 0;
2497 return target;
2498}
2499
2500Py_UCS4*
2501PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002504 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 PyErr_BadInternalCall();
2506 return NULL;
2507 }
2508 return as_ucs4(string, target, targetsize, copy_null);
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4Copy(PyObject *string)
2513{
2514 return as_ucs4(string, NULL, 0, 1);
2515}
2516
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign.  53/22 is used as an upper bound for log10(256); the
   "- 1" before the division together with one unit of the trailing
   "+ 2" implements the rounding up in integer arithmetic. */
#define MAX_LONG_LONG_CHARS ((SIZEOF_LONG_LONG * 53 - 1) / 22 + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002521
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002522static int
2523unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2524 Py_ssize_t width, Py_ssize_t precision)
2525{
2526 Py_ssize_t length, fill, arglen;
2527 Py_UCS4 maxchar;
2528
2529 if (PyUnicode_READY(str) == -1)
2530 return -1;
2531
2532 length = PyUnicode_GET_LENGTH(str);
2533 if ((precision == -1 || precision >= length)
2534 && width <= length)
2535 return _PyUnicodeWriter_WriteStr(writer, str);
2536
2537 if (precision != -1)
2538 length = Py_MIN(precision, length);
2539
2540 arglen = Py_MAX(length, width);
2541 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2542 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2543 else
2544 maxchar = writer->maxchar;
2545
2546 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2547 return -1;
2548
2549 if (width > length) {
2550 fill = width - length;
2551 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2552 return -1;
2553 writer->pos += fill;
2554 }
2555
2556 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2557 str, 0, length);
2558 writer->pos += length;
2559 return 0;
2560}
2561
2562static int
2563unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2564 Py_ssize_t width, Py_ssize_t precision)
2565{
2566 /* UTF-8 */
2567 Py_ssize_t length;
2568 PyObject *unicode;
2569 int res;
2570
2571 length = strlen(str);
2572 if (precision != -1)
2573 length = Py_MIN(length, precision);
2574 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2575 if (unicode == NULL)
2576 return -1;
2577
2578 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2579 Py_DECREF(unicode);
2580 return res;
2581}
2582
Victor Stinner96865452011-03-01 23:44:09 +00002583static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002584unicode_fromformat_arg(_PyUnicodeWriter *writer,
2585 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002586{
Victor Stinnere215d962012-10-06 23:03:36 +02002587 const char *p;
2588 Py_ssize_t len;
2589 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 Py_ssize_t width;
2591 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002592 int longflag;
2593 int longlongflag;
2594 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002596
2597 p = f;
2598 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002599 zeropad = 0;
2600 if (*f == '0') {
2601 zeropad = 1;
2602 f++;
2603 }
Victor Stinner96865452011-03-01 23:44:09 +00002604
2605 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 width = -1;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002609 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002610 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002612 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002614 return NULL;
2615 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002617 f++;
2618 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 }
2620 precision = -1;
2621 if (*f == '.') {
2622 f++;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 precision = (*f - '0');
2625 f++;
2626 while (Py_ISDIGIT((unsigned)*f)) {
2627 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2628 PyErr_SetString(PyExc_ValueError,
2629 "precision too big");
2630 return NULL;
2631 }
2632 precision = (precision * 10) + (*f - '0');
2633 f++;
2634 }
2635 }
Victor Stinner96865452011-03-01 23:44:09 +00002636 if (*f == '%') {
2637 /* "%.3%s" => f points to "3" */
2638 f--;
2639 }
2640 }
2641 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002642 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002643 f--;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645
2646 /* Handle %ld, %lu, %lld and %llu. */
2647 longflag = 0;
2648 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002649 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002650 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002652 longflag = 1;
2653 ++f;
2654 }
Victor Stinner96865452011-03-01 23:44:09 +00002655 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002656 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002657 longlongflag = 1;
2658 f += 2;
2659 }
Victor Stinner96865452011-03-01 23:44:09 +00002660 }
2661 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002663 size_tflag = 1;
2664 ++f;
2665 }
Victor Stinnere215d962012-10-06 23:03:36 +02002666
2667 if (f[1] == '\0')
2668 writer->overallocate = 0;
2669
2670 switch (*f) {
2671 case 'c':
2672 {
2673 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002674 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002675 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002676 "character argument not in range(0x110000)");
2677 return NULL;
2678 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002679 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002680 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002681 break;
2682 }
2683
2684 case 'i':
2685 case 'd':
2686 case 'u':
2687 case 'x':
2688 {
2689 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002690 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002692
2693 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002694 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002695 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002697 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002698 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002699 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002700 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, size_t));
2703 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, unsigned int));
2706 }
2707 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002709 }
2710 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002714 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002715 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002716 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002717 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002719 va_arg(*vargs, Py_ssize_t));
2720 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002721 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_arg(*vargs, int));
2723 }
2724 assert(len >= 0);
2725
Victor Stinnere215d962012-10-06 23:03:36 +02002726 if (precision < len)
2727 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728
2729 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2731 return NULL;
2732
Victor Stinnere215d962012-10-06 23:03:36 +02002733 if (width > precision) {
2734 Py_UCS4 fillchar;
2735 fill = width - precision;
2736 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002737 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2738 return NULL;
2739 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 }
Victor Stinner15a11362012-10-06 23:48:20 +02002741 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002743 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2744 return NULL;
2745 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002746 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747
Victor Stinner4a587072013-11-19 12:54:53 +01002748 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2749 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002750 break;
2751 }
2752
2753 case 'p':
2754 {
2755 char number[MAX_LONG_LONG_CHARS];
2756
2757 len = sprintf(number, "%p", va_arg(*vargs, void*));
2758 assert(len >= 0);
2759
2760 /* %p is ill-defined: ensure leading 0x. */
2761 if (number[1] == 'X')
2762 number[1] = 'x';
2763 else if (number[1] != 'x') {
2764 memmove(number + 2, number,
2765 strlen(number) + 1);
2766 number[0] = '0';
2767 number[1] = 'x';
2768 len += 2;
2769 }
2770
Victor Stinner4a587072013-11-19 12:54:53 +01002771 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002772 return NULL;
2773 break;
2774 }
2775
2776 case 's':
2777 {
2778 /* UTF-8 */
2779 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002782 break;
2783 }
2784
2785 case 'U':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 assert(obj && _PyUnicode_CHECK(obj));
2789
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 break;
2793 }
2794
2795 case 'V':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002799 if (obj) {
2800 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 }
2804 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 assert(str != NULL);
2806 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002808 }
2809 break;
2810 }
2811
2812 case 'S':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *str;
2816 assert(obj);
2817 str = PyObject_Str(obj);
2818 if (!str)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(str);
2822 return NULL;
2823 }
2824 Py_DECREF(str);
2825 break;
2826 }
2827
2828 case 'R':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *repr;
2832 assert(obj);
2833 repr = PyObject_Repr(obj);
2834 if (!repr)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(repr);
2838 return NULL;
2839 }
2840 Py_DECREF(repr);
2841 break;
2842 }
2843
2844 case 'A':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *ascii;
2848 assert(obj);
2849 ascii = PyObject_ASCII(obj);
2850 if (!ascii)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(ascii);
2854 return NULL;
2855 }
2856 Py_DECREF(ascii);
2857 break;
2858 }
2859
2860 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002861 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864
2865 default:
2866 /* if we stumble upon an unknown formatting code, copy the rest
2867 of the format string to the output string. (we cannot just
2868 skip the code, since there's no way to know what's in the
2869 argument list) */
2870 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002871 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002872 return NULL;
2873 f = p+len;
2874 return f;
2875 }
2876
2877 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002878 return f;
2879}
2880
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881PyObject *
2882PyUnicode_FromFormatV(const char *format, va_list vargs)
2883{
Victor Stinnere215d962012-10-06 23:03:36 +02002884 va_list vargs2;
2885 const char *f;
2886 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002887
Victor Stinner8f674cc2013-04-17 23:02:17 +02002888 _PyUnicodeWriter_Init(&writer);
2889 writer.min_length = strlen(format) + 100;
2890 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002891
Benjamin Peterson0c212142016-09-20 20:39:33 -07002892 // Copy varags to be able to pass a reference to a subfunction.
2893 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002894
2895 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002897 f = unicode_fromformat_arg(&writer, f, &vargs2);
2898 if (f == NULL)
2899 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002902 const char *p;
2903 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904
Victor Stinnere215d962012-10-06 23:03:36 +02002905 p = f;
2906 do
2907 {
2908 if ((unsigned char)*p > 127) {
2909 PyErr_Format(PyExc_ValueError,
2910 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2911 "string, got a non-ASCII byte: 0x%02x",
2912 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 p++;
2916 }
2917 while (*p != '\0' && *p != '%');
2918 len = p - f;
2919
2920 if (*p == '\0')
2921 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002922
2923 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925
2926 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002929 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002930 return _PyUnicodeWriter_Finish(&writer);
2931
2932 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002933 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002934 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936}
2937
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938PyObject *
2939PyUnicode_FromFormat(const char *format, ...)
2940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002941 PyObject* ret;
2942 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002943
2944#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 ret = PyUnicode_FromFormatV(format, vargs);
2950 va_end(vargs);
2951 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954#ifdef HAVE_WCHAR_H
2955
Victor Stinner5593d8a2010-10-02 11:11:27 +00002956/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2957 convert a Unicode object to a wide character string.
2958
Victor Stinnerd88d9832011-09-06 02:00:05 +02002959 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 character) required to convert the unicode object. Ignore size argument.
2961
Victor Stinnerd88d9832011-09-06 02:00:05 +02002962 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002963 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002964 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002965static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002966unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002967 wchar_t *w,
2968 Py_ssize_t size)
2969{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002970 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 const wchar_t *wstr;
2972
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002973 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002974 if (wstr == NULL)
2975 return -1;
2976
Victor Stinner5593d8a2010-10-02 11:11:27 +00002977 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002978 if (size > res)
2979 size = res + 1;
2980 else
2981 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002982 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002983 return res;
2984 }
2985 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002986 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002987}
2988
2989Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002990PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002991 wchar_t *w,
2992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993{
2994 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 PyErr_BadInternalCall();
2996 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002998 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999}
3000
Victor Stinner137c34c2010-09-29 10:25:54 +00003001wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003002PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003003 Py_ssize_t *size)
3004{
3005 wchar_t* buffer;
3006 Py_ssize_t buflen;
3007
3008 if (unicode == NULL) {
3009 PyErr_BadInternalCall();
3010 return NULL;
3011 }
3012
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003013 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 if (buflen == -1)
3015 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003016 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 if (buffer == NULL) {
3018 PyErr_NoMemory();
3019 return NULL;
3020 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003021 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003022 if (buflen == -1) {
3023 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003024 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003025 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003026 if (size != NULL)
3027 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003028 return buffer;
3029}
3030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003031#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Alexander Belopolsky40018472011-02-26 01:02:56 +00003033PyObject *
3034PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003035{
Victor Stinner8faf8212011-12-08 22:14:11 +01003036 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 PyErr_SetString(PyExc_ValueError,
3038 "chr() arg not in range(0x110000)");
3039 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003040 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003041
Victor Stinner985a82a2014-01-03 12:53:47 +01003042 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003046PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003048 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003050 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003051 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003052 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 Py_INCREF(obj);
3054 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003055 }
3056 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 /* For a Unicode subtype that's not a Unicode object,
3058 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003059 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003060 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003061 PyErr_Format(PyExc_TypeError,
3062 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003063 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003064 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003065}
3066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003068PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 const char *encoding,
3070 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003071{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003072 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003073 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003074
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 PyErr_BadInternalCall();
3077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003079
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003080 /* Decoding bytes objects is the most common case and should be fast */
3081 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003082 if (PyBytes_GET_SIZE(obj) == 0)
3083 _Py_RETURN_UNICODE_EMPTY();
3084 v = PyUnicode_Decode(
3085 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3086 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003087 return v;
3088 }
3089
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003090 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 PyErr_SetString(PyExc_TypeError,
3092 "decoding str is not supported");
3093 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003094 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003095
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003096 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3097 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3098 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003099 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003100 Py_TYPE(obj)->tp_name);
3101 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003102 }
Tim Petersced69f82003-09-16 20:30:58 +00003103
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003104 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003105 PyBuffer_Release(&buffer);
3106 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003108
Serhiy Storchaka05997252013-01-26 12:14:02 +02003109 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003110 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003111 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112}
3113
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters that sits between two copied characters is replaced by a
   single '_'; such runs at the start or at the end of the name are dropped.

   encoding:  null-terminated input name (must not be NULL).
   lower:     output buffer, always null-terminated on success.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* No room even for the null terminator; "lower_len - 1" below would
           wrap around and the final store would write out of bounds. */
        return 0;
    }

    l = lower;
    /* Last usable slot: reserved for the null terminator. */
    l_end = &lower[lower_len - 1];
    /* Set while a run of separator characters is pending. */
    punct = 0;

    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* Emit the pending separator, except at the very start. */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003165 Py_ssize_t size,
3166 const char *encoding,
3167 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003168{
3169 PyObject *buffer = NULL, *unicode;
3170 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003171 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3172
3173 if (encoding == NULL) {
3174 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3175 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003176
Fred Drakee4315f52000-05-09 19:53:39 +00003177 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003178 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3179 char *lower = buflower;
3180
3181 /* Fast paths */
3182 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3183 lower += 3;
3184 if (*lower == '_') {
3185 /* Match "utf8" and "utf_8" */
3186 lower++;
3187 }
3188
3189 if (lower[0] == '8' && lower[1] == 0) {
3190 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3191 }
3192 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3193 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3194 }
3195 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3196 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3197 }
3198 }
3199 else {
3200 if (strcmp(lower, "ascii") == 0
3201 || strcmp(lower, "us_ascii") == 0) {
3202 return PyUnicode_DecodeASCII(s, size, errors);
3203 }
Steve Dowercc16be82016-09-08 10:35:16 -07003204 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003205 else if (strcmp(lower, "mbcs") == 0) {
3206 return PyUnicode_DecodeMBCS(s, size, errors);
3207 }
3208 #endif
3209 else if (strcmp(lower, "latin1") == 0
3210 || strcmp(lower, "latin_1") == 0
3211 || strcmp(lower, "iso_8859_1") == 0
3212 || strcmp(lower, "iso8859_1") == 0) {
3213 return PyUnicode_DecodeLatin1(s, size, errors);
3214 }
3215 }
Victor Stinner37296e82010-06-10 13:36:23 +00003216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217
3218 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003219 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003220 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003221 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003222 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 if (buffer == NULL)
3224 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003225 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 if (unicode == NULL)
3227 goto onError;
3228 if (!PyUnicode_Check(unicode)) {
3229 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003230 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3231 "use codecs.decode() to decode to arbitrary types",
3232 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003233 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 Py_DECREF(unicode);
3235 goto onError;
3236 }
3237 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003238 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003239
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 Py_XDECREF(buffer);
3242 return NULL;
3243}
3244
Alexander Belopolsky40018472011-02-26 01:02:56 +00003245PyObject *
3246PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003247 const char *encoding,
3248 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250 if (!PyUnicode_Check(unicode)) {
3251 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003252 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003253 }
3254
Serhiy Storchaka00939072016-10-27 21:05:49 +03003255 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3256 "PyUnicode_AsDecodedObject() is deprecated; "
3257 "use PyCodec_Decode() to decode from str", 1) < 0)
3258 return NULL;
3259
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003260 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003262
3263 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003264 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003265}
3266
Alexander Belopolsky40018472011-02-26 01:02:56 +00003267PyObject *
3268PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003269 const char *encoding,
3270 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003271{
3272 PyObject *v;
3273
3274 if (!PyUnicode_Check(unicode)) {
3275 PyErr_BadArgument();
3276 goto onError;
3277 }
3278
Serhiy Storchaka00939072016-10-27 21:05:49 +03003279 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3280 "PyUnicode_AsDecodedUnicode() is deprecated; "
3281 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3282 return NULL;
3283
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003286
3287 /* Decode via the codec registry */
3288 v = PyCodec_Decode(unicode, encoding, errors);
3289 if (v == NULL)
3290 goto onError;
3291 if (!PyUnicode_Check(v)) {
3292 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003293 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3294 "use codecs.decode() to decode to arbitrary types",
3295 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003296 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003297 Py_DECREF(v);
3298 goto onError;
3299 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003300 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003301
Benjamin Peterson29060642009-01-31 22:14:21 +00003302 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003303 return NULL;
3304}
3305
Alexander Belopolsky40018472011-02-26 01:02:56 +00003306PyObject *
3307PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003308 Py_ssize_t size,
3309 const char *encoding,
3310 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311{
3312 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003313
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003314 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3318 Py_DECREF(unicode);
3319 return v;
3320}
3321
Alexander Belopolsky40018472011-02-26 01:02:56 +00003322PyObject *
3323PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003324 const char *encoding,
3325 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003326{
3327 PyObject *v;
3328
3329 if (!PyUnicode_Check(unicode)) {
3330 PyErr_BadArgument();
3331 goto onError;
3332 }
3333
Serhiy Storchaka00939072016-10-27 21:05:49 +03003334 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3335 "PyUnicode_AsEncodedObject() is deprecated; "
3336 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3337 "or PyCodec_Encode() for generic encoding", 1) < 0)
3338 return NULL;
3339
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003340 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003342
3343 /* Encode via the codec registry */
3344 v = PyCodec_Encode(unicode, encoding, errors);
3345 if (v == NULL)
3346 goto onError;
3347 return v;
3348
Benjamin Peterson29060642009-01-31 22:14:21 +00003349 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003350 return NULL;
3351}
3352
/* Locate the first character of wstr that wcstombs() cannot encode.

   The string is fed to wcstombs() one code point at a time (a surrogate
   pair counts as one code point when wchar_t is 16-bit).  Return the
   index, in wchar_t units, of the first code point that fails; return 0
   if every code point encodes, which is indistinguishable from a failure
   at index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Holds a single code point followed by a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *candidate = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = *cursor;
        cursor += 1;
#endif

        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1) {
            return candidate - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3399
Victor Stinner1b579672011-12-17 05:47:23 +01003400static int
3401locale_error_handler(const char *errors, int *surrogateescape)
3402{
Victor Stinner50149202015-09-22 00:26:54 +02003403 _Py_error_handler error_handler = get_error_handler(errors);
3404 switch (error_handler)
3405 {
3406 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003407 *surrogateescape = 0;
3408 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003409 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003410 *surrogateescape = 1;
3411 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003412 default:
3413 PyErr_Format(PyExc_ValueError,
3414 "only 'strict' and 'surrogateescape' error handlers "
3415 "are supported, not '%s'",
3416 errors);
3417 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003418 }
Victor Stinner1b579672011-12-17 05:47:23 +01003419}
3420
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003422PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003423{
3424 Py_ssize_t wlen, wlen2;
3425 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003426 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003427 PyObject *bytes, *reason, *exc;
3428 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003429 int surrogateescape;
3430
3431 if (locale_error_handler(errors, &surrogateescape) < 0)
3432 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003433
3434 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3435 if (wstr == NULL)
3436 return NULL;
3437
3438 wlen2 = wcslen(wstr);
3439 if (wlen2 != wlen) {
3440 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003441 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003442 return NULL;
3443 }
3444
3445 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003446 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003447 char *str;
3448
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003449 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450 if (str == NULL) {
3451 if (error_pos == (size_t)-1) {
3452 PyErr_NoMemory();
3453 PyMem_Free(wstr);
3454 return NULL;
3455 }
3456 else {
3457 goto encode_error;
3458 }
3459 }
3460 PyMem_Free(wstr);
3461
3462 bytes = PyBytes_FromString(str);
3463 PyMem_Free(str);
3464 }
3465 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003466 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003467 size_t len, len2;
3468
3469 len = wcstombs(NULL, wstr, 0);
3470 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003471 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003472 goto encode_error;
3473 }
3474
3475 bytes = PyBytes_FromStringAndSize(NULL, len);
3476 if (bytes == NULL) {
3477 PyMem_Free(wstr);
3478 return NULL;
3479 }
3480
3481 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3482 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003483 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003484 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003485 goto encode_error;
3486 }
3487 PyMem_Free(wstr);
3488 }
3489 return bytes;
3490
3491encode_error:
3492 errmsg = strerror(errno);
3493 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003494
3495 if (error_pos == (size_t)-1)
3496 error_pos = wcstombs_errorpos(wstr);
3497
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003498 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003499
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003500 wstr = Py_DecodeLocale(errmsg, &errlen);
3501 if (wstr != NULL) {
3502 reason = PyUnicode_FromWideChar(wstr, errlen);
3503 PyMem_RawFree(wstr);
3504 } else {
3505 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003506 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003507
Victor Stinner2f197072011-12-17 07:08:30 +01003508 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003509 reason = PyUnicode_FromString(
3510 "wcstombs() encountered an unencodable "
3511 "wide character");
3512 if (reason == NULL)
3513 return NULL;
3514
3515 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3516 "locale", unicode,
3517 (Py_ssize_t)error_pos,
3518 (Py_ssize_t)(error_pos+1),
3519 reason);
3520 Py_DECREF(reason);
3521 if (exc != NULL) {
3522 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003523 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003524 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003525 return NULL;
3526}
3527
Victor Stinnerad158722010-10-27 00:25:46 +00003528PyObject *
3529PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003530{
Steve Dowercc16be82016-09-08 10:35:16 -07003531#if defined(__APPLE__)
3532 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003533#else
Victor Stinner793b5312011-04-27 00:24:21 +02003534 PyInterpreterState *interp = PyThreadState_GET()->interp;
3535 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3536 cannot use it to encode and decode filenames before it is loaded. Load
3537 the Python codec requires to encode at least its own filename. Use the C
3538 version of the locale codec until the codec registry is initialized and
3539 the Python codec is loaded.
3540
3541 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3542 cannot only rely on it: check also interp->fscodec_initialized for
3543 subinterpreters. */
3544 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003545 return PyUnicode_AsEncodedString(unicode,
3546 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003547 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003548 }
3549 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003550 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003551 }
Victor Stinnerad158722010-10-27 00:25:46 +00003552#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003553}
3554
Alexander Belopolsky40018472011-02-26 01:02:56 +00003555PyObject *
3556PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003557 const char *encoding,
3558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559{
3560 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003561 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003562
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 if (!PyUnicode_Check(unicode)) {
3564 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 }
Fred Drakee4315f52000-05-09 19:53:39 +00003567
Victor Stinner942889a2016-09-05 15:40:10 -07003568 if (encoding == NULL) {
3569 return _PyUnicode_AsUTF8String(unicode, errors);
3570 }
3571
Fred Drakee4315f52000-05-09 19:53:39 +00003572 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003573 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3574 char *lower = buflower;
3575
3576 /* Fast paths */
3577 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3578 lower += 3;
3579 if (*lower == '_') {
3580 /* Match "utf8" and "utf_8" */
3581 lower++;
3582 }
3583
3584 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003585 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003586 }
3587 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3588 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3589 }
3590 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3591 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3592 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003593 }
Victor Stinner942889a2016-09-05 15:40:10 -07003594 else {
3595 if (strcmp(lower, "ascii") == 0
3596 || strcmp(lower, "us_ascii") == 0) {
3597 return _PyUnicode_AsASCIIString(unicode, errors);
3598 }
Steve Dowercc16be82016-09-08 10:35:16 -07003599#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003600 else if (strcmp(lower, "mbcs") == 0) {
3601 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3602 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003603#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003604 else if (strcmp(lower, "latin1") == 0 ||
3605 strcmp(lower, "latin_1") == 0 ||
3606 strcmp(lower, "iso_8859_1") == 0 ||
3607 strcmp(lower, "iso8859_1") == 0) {
3608 return _PyUnicode_AsLatin1String(unicode, errors);
3609 }
3610 }
Victor Stinner37296e82010-06-10 13:36:23 +00003611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612
3613 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003614 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003616 return NULL;
3617
3618 /* The normal path */
3619 if (PyBytes_Check(v))
3620 return v;
3621
3622 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003623 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003624 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003625 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003626
3627 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003628 "encoder %s returned bytearray instead of bytes; "
3629 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003630 encoding);
3631 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003632 Py_DECREF(v);
3633 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003634 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003635
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003636 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3637 Py_DECREF(v);
3638 return b;
3639 }
3640
3641 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003642 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3643 "use codecs.encode() to encode to arbitrary types",
3644 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003645 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003646 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003647 return NULL;
3648}
3649
Alexander Belopolsky40018472011-02-26 01:02:56 +00003650PyObject *
3651PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003652 const char *encoding,
3653 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003654{
3655 PyObject *v;
3656
3657 if (!PyUnicode_Check(unicode)) {
3658 PyErr_BadArgument();
3659 goto onError;
3660 }
3661
Serhiy Storchaka00939072016-10-27 21:05:49 +03003662 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3663 "PyUnicode_AsEncodedUnicode() is deprecated; "
3664 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3665 return NULL;
3666
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003667 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003669
3670 /* Encode via the codec registry */
3671 v = PyCodec_Encode(unicode, encoding, errors);
3672 if (v == NULL)
3673 goto onError;
3674 if (!PyUnicode_Check(v)) {
3675 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003676 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3677 "use codecs.encode() to encode to arbitrary types",
3678 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003679 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003680 Py_DECREF(v);
3681 goto onError;
3682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003684
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 return NULL;
3687}
3688
/* Locate the first byte sequence in str[0..len) that mbrtowc() rejects.

   Returns the byte offset of the first invalid or incomplete multibyte
   sequence.  Returns 0 when no such sequence is found, when a null
   character is reached first, or when mbrtowc() is not available
   (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3719
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003720PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003721PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003722 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003723{
3724 wchar_t smallbuf[256];
3725 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3726 wchar_t *wstr;
3727 size_t wlen, wlen2;
3728 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003729 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003730 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003731 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003732 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003733
3734 if (locale_error_handler(errors, &surrogateescape) < 0)
3735 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003736
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003737 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3738 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003739 return NULL;
3740 }
3741
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003742 if (surrogateescape) {
3743 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003744 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003745 if (wstr == NULL) {
3746 if (wlen == (size_t)-1)
3747 PyErr_NoMemory();
3748 else
3749 PyErr_SetFromErrno(PyExc_OSError);
3750 return NULL;
3751 }
3752
3753 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003754 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755 }
3756 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003757 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003758#ifndef HAVE_BROKEN_MBSTOWCS
3759 wlen = mbstowcs(NULL, str, 0);
3760#else
3761 wlen = len;
3762#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003763 if (wlen == (size_t)-1)
3764 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003765 if (wlen+1 <= smallbuf_len) {
3766 wstr = smallbuf;
3767 }
3768 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003769 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003770 if (!wstr)
3771 return PyErr_NoMemory();
3772 }
3773
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003774 wlen2 = mbstowcs(wstr, str, wlen+1);
3775 if (wlen2 == (size_t)-1) {
3776 if (wstr != smallbuf)
3777 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003778 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003779 }
3780#ifdef HAVE_BROKEN_MBSTOWCS
3781 assert(wlen2 == wlen);
3782#endif
3783 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3784 if (wstr != smallbuf)
3785 PyMem_Free(wstr);
3786 }
3787 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003788
3789decode_error:
3790 errmsg = strerror(errno);
3791 assert(errmsg != NULL);
3792
3793 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003794 wstr = Py_DecodeLocale(errmsg, &errlen);
3795 if (wstr != NULL) {
3796 reason = PyUnicode_FromWideChar(wstr, errlen);
3797 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003798 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003799
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003800 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003801 reason = PyUnicode_FromString(
3802 "mbstowcs() encountered an invalid multibyte sequence");
3803 if (reason == NULL)
3804 return NULL;
3805
3806 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3807 "locale", str, len,
3808 (Py_ssize_t)error_pos,
3809 (Py_ssize_t)(error_pos+1),
3810 reason);
3811 Py_DECREF(reason);
3812 if (exc != NULL) {
3813 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003814 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003815 }
3816 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003817}
3818
3819PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003820PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003821{
3822 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003823 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003824}
3825
3826
3827PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003828PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003829 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003830 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3831}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003832
Christian Heimes5894ba72007-11-04 11:43:14 +00003833PyObject*
3834PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3835{
Steve Dowercc16be82016-09-08 10:35:16 -07003836#if defined(__APPLE__)
3837 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003838#else
Victor Stinner793b5312011-04-27 00:24:21 +02003839 PyInterpreterState *interp = PyThreadState_GET()->interp;
3840 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3841 cannot use it to encode and decode filenames before it is loaded. Load
3842 the Python codec requires to encode at least its own filename. Use the C
3843 version of the locale codec until the codec registry is initialized and
3844 the Python codec is loaded.
3845
3846 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3847 cannot only rely on it: check also interp->fscodec_initialized for
3848 subinterpreters. */
3849 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003850 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003851 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003852 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003853 }
3854 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003855 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003856 }
Victor Stinnerad158722010-10-27 00:25:46 +00003857#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003858}
3859
Martin v. Löwis011e8422009-05-05 04:43:17 +00003860
3861int
3862PyUnicode_FSConverter(PyObject* arg, void* addr)
3863{
Brett Cannonec6ce872016-09-06 15:50:29 -07003864 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003865 PyObject *output = NULL;
3866 Py_ssize_t size;
3867 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003868 if (arg == NULL) {
3869 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003870 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003871 return 1;
3872 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003873 path = PyOS_FSPath(arg);
3874 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003875 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003876 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003877 if (PyBytes_Check(path)) {
3878 output = path;
3879 }
3880 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3881 output = PyUnicode_EncodeFSDefault(path);
3882 Py_DECREF(path);
3883 if (!output) {
3884 return 0;
3885 }
3886 assert(PyBytes_Check(output));
3887 }
3888
Victor Stinner0ea2a462010-04-30 00:22:08 +00003889 size = PyBytes_GET_SIZE(output);
3890 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003891 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003892 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003893 Py_DECREF(output);
3894 return 0;
3895 }
3896 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003897 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003898}
3899
3900
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003901int
3902PyUnicode_FSDecoder(PyObject* arg, void* addr)
3903{
Brett Cannona5711202016-09-06 19:36:01 -07003904 int is_buffer = 0;
3905 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003906 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003907 if (arg == NULL) {
3908 Py_DECREF(*(PyObject**)addr);
3909 return 1;
3910 }
Brett Cannona5711202016-09-06 19:36:01 -07003911
3912 is_buffer = PyObject_CheckBuffer(arg);
3913 if (!is_buffer) {
3914 path = PyOS_FSPath(arg);
3915 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003916 return 0;
3917 }
Brett Cannona5711202016-09-06 19:36:01 -07003918 }
3919 else {
3920 path = arg;
3921 Py_INCREF(arg);
3922 }
3923
3924 if (PyUnicode_Check(path)) {
3925 if (PyUnicode_READY(path) == -1) {
3926 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003927 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003928 }
3929 output = path;
3930 }
3931 else if (PyBytes_Check(path) || is_buffer) {
3932 PyObject *path_bytes = NULL;
3933
3934 if (!PyBytes_Check(path) &&
3935 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3936 "path should be string, bytes, or os.PathLike, not %.200s",
3937 Py_TYPE(arg)->tp_name)) {
3938 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003939 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003940 }
3941 path_bytes = PyBytes_FromObject(path);
3942 Py_DECREF(path);
3943 if (!path_bytes) {
3944 return 0;
3945 }
3946 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3947 PyBytes_GET_SIZE(path_bytes));
3948 Py_DECREF(path_bytes);
3949 if (!output) {
3950 return 0;
3951 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003952 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003953 else {
3954 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003955 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003956 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003957 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003958 return 0;
3959 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003960 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003961 Py_DECREF(output);
3962 return 0;
3963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003965 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003966 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003967 Py_DECREF(output);
3968 return 0;
3969 }
3970 *(PyObject**)addr = output;
3971 return Py_CLEANUP_SUPPORTED;
3972}
3973
3974
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003975const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003977{
Christian Heimesf3863112007-11-22 07:46:41 +00003978 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003980 if (!PyUnicode_Check(unicode)) {
3981 PyErr_BadArgument();
3982 return NULL;
3983 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003984 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003985 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003987 if (PyUnicode_UTF8(unicode) == NULL) {
3988 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003989 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 if (bytes == NULL)
3991 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003992 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3993 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003994 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 Py_DECREF(bytes);
3996 return NULL;
3997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003998 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003999 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004000 PyBytes_AS_STRING(bytes),
4001 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002 Py_DECREF(bytes);
4003 }
4004
4005 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004006 *psize = PyUnicode_UTF8_LENGTH(unicode);
4007 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004008}
4009
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004010const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4014}
4015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016Py_UNICODE *
4017PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 const unsigned char *one_byte;
4020#if SIZEOF_WCHAR_T == 4
4021 const Py_UCS2 *two_bytes;
4022#else
4023 const Py_UCS4 *four_bytes;
4024 const Py_UCS4 *ucs4_end;
4025 Py_ssize_t num_surrogates;
4026#endif
4027 wchar_t *w;
4028 wchar_t *wchar_end;
4029
4030 if (!PyUnicode_Check(unicode)) {
4031 PyErr_BadArgument();
4032 return NULL;
4033 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004034 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004036 assert(_PyUnicode_KIND(unicode) != 0);
4037 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004039 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004041 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4042 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 num_surrogates = 0;
4044
4045 for (; four_bytes < ucs4_end; ++four_bytes) {
4046 if (*four_bytes > 0xFFFF)
4047 ++num_surrogates;
4048 }
4049
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004050 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4051 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4052 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 PyErr_NoMemory();
4054 return NULL;
4055 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004056 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004058 w = _PyUnicode_WSTR(unicode);
4059 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4060 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4062 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004063 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004065 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4066 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 }
4068 else
4069 *w = *four_bytes;
4070
4071 if (w > wchar_end) {
4072 assert(0 && "Miscalculated string end");
4073 }
4074 }
4075 *w = 0;
4076#else
4077 /* sizeof(wchar_t) == 4 */
4078 Py_FatalError("Impossible unicode object state, wstr and str "
4079 "should share memory already.");
4080 return NULL;
4081#endif
4082 }
4083 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004084 if ((size_t)_PyUnicode_LENGTH(unicode) >
4085 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4086 PyErr_NoMemory();
4087 return NULL;
4088 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004089 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4090 (_PyUnicode_LENGTH(unicode) + 1));
4091 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 PyErr_NoMemory();
4093 return NULL;
4094 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004095 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4096 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4097 w = _PyUnicode_WSTR(unicode);
4098 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004100 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4101 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 for (; w < wchar_end; ++one_byte, ++w)
4103 *w = *one_byte;
4104 /* null-terminate the wstr */
4105 *w = 0;
4106 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004107 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004109 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110 for (; w < wchar_end; ++two_bytes, ++w)
4111 *w = *two_bytes;
4112 /* null-terminate the wstr */
4113 *w = 0;
4114#else
4115 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004116 PyObject_FREE(_PyUnicode_WSTR(unicode));
4117 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 Py_FatalError("Impossible unicode object state, wstr "
4119 "and str should share memory already.");
4120 return NULL;
4121#endif
4122 }
4123 else {
4124 assert(0 && "This should never happen.");
4125 }
4126 }
4127 }
4128 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004129 *size = PyUnicode_WSTR_LENGTH(unicode);
4130 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004131}
4132
Alexander Belopolsky40018472011-02-26 01:02:56 +00004133Py_UNICODE *
4134PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137}
4138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139
Alexander Belopolsky40018472011-02-26 01:02:56 +00004140Py_ssize_t
4141PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142{
4143 if (!PyUnicode_Check(unicode)) {
4144 PyErr_BadArgument();
4145 goto onError;
4146 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004147 if (_PyUnicode_WSTR(unicode) == NULL) {
4148 if (PyUnicode_AsUnicode(unicode) == NULL)
4149 goto onError;
4150 }
4151 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 return -1;
4155}
4156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004157Py_ssize_t
4158PyUnicode_GetLength(PyObject *unicode)
4159{
Victor Stinner07621332012-06-16 04:53:46 +02004160 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004161 PyErr_BadArgument();
4162 return -1;
4163 }
Victor Stinner07621332012-06-16 04:53:46 +02004164 if (PyUnicode_READY(unicode) == -1)
4165 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166 return PyUnicode_GET_LENGTH(unicode);
4167}
4168
4169Py_UCS4
4170PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4171{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004172 void *data;
4173 int kind;
4174
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004175 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4176 PyErr_BadArgument();
4177 return (Py_UCS4)-1;
4178 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004179 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004180 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 return (Py_UCS4)-1;
4182 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004183 data = PyUnicode_DATA(unicode);
4184 kind = PyUnicode_KIND(unicode);
4185 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004186}
4187
4188int
4189PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4190{
4191 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004192 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193 return -1;
4194 }
Victor Stinner488fa492011-12-12 00:01:39 +01004195 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004196 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004197 PyErr_SetString(PyExc_IndexError, "string index out of range");
4198 return -1;
4199 }
Victor Stinner488fa492011-12-12 00:01:39 +01004200 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004201 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004202 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4203 PyErr_SetString(PyExc_ValueError, "character out of range");
4204 return -1;
4205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4207 index, ch);
4208 return 0;
4209}
4210
/* Return the name of the default encoding, the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4216
Victor Stinner554f3f02010-06-16 23:33:54 +00004217/* create or adjust a UnicodeDecodeError */
4218static void
4219make_decode_exception(PyObject **exceptionObject,
4220 const char *encoding,
4221 const char *input, Py_ssize_t length,
4222 Py_ssize_t startpos, Py_ssize_t endpos,
4223 const char *reason)
4224{
4225 if (*exceptionObject == NULL) {
4226 *exceptionObject = PyUnicodeDecodeError_Create(
4227 encoding, input, length, startpos, endpos, reason);
4228 }
4229 else {
4230 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4231 goto onError;
4232 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4233 goto onError;
4234 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4235 goto onError;
4236 }
4237 return;
4238
4239onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004240 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004241}
4242
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler
       *errorHandler caches the callback; when it is NULL it is looked
       up from `errors` with PyCodec_LookupError().
   encoding, reason
       forwarded to the UnicodeDecodeError that is created or updated.
   input, inend
       in/out: the input buffer.  They are reloaded from the exception's
       object after the callback ran, because the callback may have
       replaced it.
   startinpos, endinpos
       the offending input range; *endinpos is set to the resume
       position returned by the callback.
   exceptionObject
       in/out: exception reused across calls (see make_decode_exception).
   inptr
       out: set to *input + the resume position.
   output, outpos
       in/out: wchar_t-kind output string (resized when too small) and
       the current write position, advanced by the replacement length.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "Un" parses (str, int); the text after ';' doubles as the error
       message, which is why &argparse[3] is used for the TypeError. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;   /* borrowed from restuple */
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is enough and cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  A length-based copy is required
       here: wcsncpy() would stop at the first embedded NUL of the
       replacement and zero-fill the remainder, corrupting replacement
       strings that legitimately contain '\0'. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004355
4356static int
4357unicode_decode_call_errorhandler_writer(
4358 const char *errors, PyObject **errorHandler,
4359 const char *encoding, const char *reason,
4360 const char **input, const char **inend, Py_ssize_t *startinpos,
4361 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4362 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4363{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004364 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004365
4366 PyObject *restuple = NULL;
4367 PyObject *repunicode = NULL;
4368 Py_ssize_t insize;
4369 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004370 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 PyObject *inputobj = NULL;
4372
4373 if (*errorHandler == NULL) {
4374 *errorHandler = PyCodec_LookupError(errors);
4375 if (*errorHandler == NULL)
4376 goto onError;
4377 }
4378
4379 make_decode_exception(exceptionObject,
4380 encoding,
4381 *input, *inend - *input,
4382 *startinpos, *endinpos,
4383 reason);
4384 if (*exceptionObject == NULL)
4385 goto onError;
4386
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004387 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004388 if (restuple == NULL)
4389 goto onError;
4390 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004391 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004392 goto onError;
4393 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004394 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004395 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004396
4397 /* Copy back the bytes variables, which might have been modified by the
4398 callback */
4399 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4400 if (!inputobj)
4401 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004402 *input = PyBytes_AS_STRING(inputobj);
4403 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004404 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004405 /* we can DECREF safely, as the exception has another reference,
4406 so the object won't go away. */
4407 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004411 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004412 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415
Victor Stinner170ca6f2013-04-18 00:25:28 +02004416 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004417 if (replen > 1) {
4418 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004419 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004420 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4421 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4422 goto onError;
4423 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004425 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004428 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004431 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004432 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437}
4438
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439/* --- UTF-7 Codec -------------------------------------------------------- */
4440
Antoine Pitrou244651a2009-05-04 18:56:13 +00004441/* See RFC2152 for details. We encode conservatively and decode liberally. */
4442
/* Three simple macros defining base-64. */

/* NOTE: all of the macros below expand their argument several times,
   so they must only be used with side-effect free expressions (plain
   variables, not e.g. `*s++`). */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */
/* (any c outside A-Z, a-z, 0-9 and '+' yields 63, so the result is only
   meaningful when IS_BASE64(c) holds) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */
/* (n is masked with 0x3f, so higher bits of n are ignored) */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4473
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only
 * lookup data, hence it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4507
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the code point to test; only 1..127 can ever be direct
 *            (NUL and everything >= 128 are rejected before the
 *            utf7_category lookup, which also keeps the index in range).
 * directO  : non-zero if "Set O" characters (category 1) are written
 *            as themselves.
 * directWS : non-zero if whitespace (category 2) is written as itself.
 *
 * "Set D" characters (category 0) are always direct.  The macro expands
 * c several times, so pass a side-effect free expression. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519
Alexander Belopolsky40018472011-02-26 01:02:56 +00004520PyObject *
4521PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004522 Py_ssize_t size,
4523 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004525 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4526}
4527
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  : the UTF-7 encoded input buffer and its length in bytes.
 * errors   : error handler name, forwarded to
 *            unicode_decode_call_errorhandler_writer().
 * consumed : if non-NULL, receives the number of input bytes that were
 *            decoded.  In that case input ending inside a shift
 *            sequence is not an error: *consumed is set to the position
 *            of the '+' that opened the sequence and the output written
 *            for it is dropped.  If NULL, a shift sequence left in an
 *            inconsistent state at the end of the input is reported to
 *            the error handler.
 *
 * Returns a new str object, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;          /* input offset where the current
                                       shift sequence / error started */
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero inside a "+..." section */
    Py_ssize_t shiftOutStart;       /* writer position when the current
                                       shift sequence began */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* re-entry point used after the end-of-input error handler moved
           s back inside the buffer (see "end of string" below) */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is and
                               go on to handle outCh normally */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* a pending high surrogate is only emitted when the
                   terminating byte is one that decodes as itself */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte >= 128 outside of a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* report input[startinpos:endinpos]; the helper updates starts,
           e and s, so decoding resumes where the handler says */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have moved s back before the end of the
               input: continue decoding from there */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* NOTE(review): when the dropped shift sequence already wrote
               output and the writer holds non-ASCII data, the result is
               rebuilt from the first shiftOutStart characters instead of
               rewinding writer.pos -- presumably so that the discarded
               characters cannot influence the kind of the result;
               confirm against _PyUnicodeWriter_Finish(). */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4726
4727
/* Encode the str object `str` as UTF-7 and return a new bytes object,
   or NULL with an exception set.

   base64SetO       : if non-zero, "Set O" characters are base64 encoded;
                      if zero they are written as themselves.
   base64WhiteSpace : if non-zero, whitespace characters are base64
                      encoded; if zero they are written as themselves.
   errors           : not referenced by this function.

   Characters that cannot be written directly are emitted inside a
   "+...-" shift sequence as base64-encoded UTF-16 code units. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                /* non-zero inside a "+..." section */
    Py_ssize_t i;
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    /* (8 output bytes are reserved per input character; the check keeps
       len * 8 from overflowing Py_ssize_t) */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* open a shift sequence and encode ch inside it */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* append ch to the bit buffer as one or two 16-bit units and
           flush every complete group of 6 bits as a base64 character */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* flush the last partial group (left-aligned in 6 bits) and close a
       shift sequence that is still open at the end of the input */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* shrink the over-allocated buffer to the bytes actually written */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004829PyObject *
4830PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4831 Py_ssize_t size,
4832 int base64SetO,
4833 int base64WhiteSpace,
4834 const char *errors)
4835{
4836 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004837 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004838 if (tmp == NULL)
4839 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004840 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004841 base64WhiteSpace, errors);
4842 Py_DECREF(tmp);
4843 return result;
4844}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004845
/* The base-64 / direct-character helper macros are only meant for the
   UTF-7 codec above; undefine them so they do not leak into the rest
   of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004851
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852/* --- UTF-8 Codec -------------------------------------------------------- */
4853
Alexander Belopolsky40018472011-02-26 01:02:56 +00004854PyObject *
4855PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004856 Py_ssize_t size,
4857 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858{
Walter Dörwald69652032004-09-07 20:24:22 +00004859 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860}
4861
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862#include "stringlib/asciilib.h"
4863#include "stringlib/codecs.h"
4864#include "stringlib/undef.h"
4865
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004866#include "stringlib/ucs1lib.h"
4867#include "stringlib/codecs.h"
4868#include "stringlib/undef.h"
4869
4870#include "stringlib/ucs2lib.h"
4871#include "stringlib/codecs.h"
4872#include "stringlib/undef.h"
4873
4874#include "stringlib/ucs4lib.h"
4875#include "stringlib/codecs.h"
4876#include "stringlib/undef.h"
4877
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
/* The mask has the top bit (0x80) of every byte set, so
   `value & ASCII_CHAR_MASK` is non-zero exactly when at least one of
   the bytes packed into `value` is >= 0x80.  One constant is provided
   per supported width of 'long'; any other width is a build error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4887
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888static Py_ssize_t
4889ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004892 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004894 /*
4895 * Issue #17237: m68k is a bit different from most architectures in
4896 * that objects do not use "natural alignment" - for example, int and
4897 * long are only aligned at 2-byte boundaries. Therefore the assert()
4898 * won't work; also, tests have shown that skipping the "optimised
4899 * version" will even speed up m68k.
4900 */
4901#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004903 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4904 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 /* Fast path, see in STRINGLIB(utf8_decode) for
4906 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004907 /* Help allocation */
4908 const char *_p = p;
4909 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 while (_p < aligned_end) {
4911 unsigned long value = *(const unsigned long *) _p;
4912 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 *((unsigned long *)q) = value;
4915 _p += SIZEOF_LONG;
4916 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004917 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 p = _p;
4919 while (p < end) {
4920 if ((unsigned char)*p & 0x80)
4921 break;
4922 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004927#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928 while (p < end) {
4929 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4930 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004931 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004932 /* Help allocation */
4933 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934 while (_p < aligned_end) {
4935 unsigned long value = *(unsigned long *) _p;
4936 if (value & ASCII_CHAR_MASK)
4937 break;
4938 _p += SIZEOF_LONG;
4939 }
4940 p = _p;
4941 if (_p == end)
4942 break;
4943 }
4944 if ((unsigned char)*p & 0x80)
4945 break;
4946 ++p;
4947 }
4948 memcpy(dest, start, p - start);
4949 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950}
Antoine Pitrouab868312009-01-10 15:40:25 +00004951
Victor Stinner785938e2011-12-11 20:09:03 +01004952PyObject *
4953PyUnicode_DecodeUTF8Stateful(const char *s,
4954 Py_ssize_t size,
4955 const char *errors,
4956 Py_ssize_t *consumed)
4957{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004958 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004959 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961
4962 Py_ssize_t startinpos;
4963 Py_ssize_t endinpos;
4964 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004965 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004967 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004968
4969 if (size == 0) {
4970 if (consumed)
4971 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004972 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004973 }
4974
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4976 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004977 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 *consumed = 1;
4979 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004980 }
4981
Victor Stinner8f674cc2013-04-17 23:02:17 +02004982 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004983 writer.min_length = size;
4984 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004986
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 writer.pos = ascii_decode(s, end, writer.data);
4988 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 while (s < end) {
4990 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004992
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004994 if (PyUnicode_IS_ASCII(writer.buffer))
4995 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 } else {
5001 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 }
5004
5005 switch (ch) {
5006 case 0:
5007 if (s == end || consumed)
5008 goto End;
5009 errmsg = "unexpected end of data";
5010 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005011 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 break;
5013 case 1:
5014 errmsg = "invalid start byte";
5015 startinpos = s - starts;
5016 endinpos = startinpos + 1;
5017 break;
5018 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005019 case 3:
5020 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 errmsg = "invalid continuation byte";
5022 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005023 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 break;
5025 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005026 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005027 goto onError;
5028 continue;
5029 }
5030
Victor Stinner1d65d912015-10-05 13:43:50 +02005031 if (error_handler == _Py_ERROR_UNKNOWN)
5032 error_handler = get_error_handler(errors);
5033
5034 switch (error_handler) {
5035 case _Py_ERROR_IGNORE:
5036 s += (endinpos - startinpos);
5037 break;
5038
5039 case _Py_ERROR_REPLACE:
5040 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5041 goto onError;
5042 s += (endinpos - startinpos);
5043 break;
5044
5045 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005046 {
5047 Py_ssize_t i;
5048
Victor Stinner1d65d912015-10-05 13:43:50 +02005049 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5050 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005051 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005052 ch = (Py_UCS4)(unsigned char)(starts[i]);
5053 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5054 ch + 0xdc00);
5055 writer.pos++;
5056 }
5057 s += (endinpos - startinpos);
5058 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005059 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005060
5061 default:
5062 if (unicode_decode_call_errorhandler_writer(
5063 errors, &error_handler_obj,
5064 "utf-8", errmsg,
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
5066 &writer))
5067 goto onError;
5068 }
Victor Stinner785938e2011-12-11 20:09:03 +01005069 }
5070
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 if (consumed)
5073 *consumed = s - starts;
5074
Victor Stinner1d65d912015-10-05 13:43:50 +02005075 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005077 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078
5079onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005080 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005082 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005084}
5085
Xavier de Gaye76febd02016-12-15 20:59:58 +01005086#if defined(__APPLE__) || defined(__ANDROID__)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005087
5088/* Simplified UTF-8 decoder using surrogateescape error handler,
Xavier de Gaye76febd02016-12-15 20:59:58 +01005089 used to decode the command line arguments on Mac OS X and Android.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005090
5091 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005092 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005093
5094wchar_t*
5095_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5096{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005097 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005098 wchar_t *unicode;
5099 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005100
5101 /* Note: size will always be longer than the resulting Unicode
5102 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005103 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005104 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005105 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005106 if (!unicode)
5107 return NULL;
5108
5109 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005110 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005111 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005112 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005113 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005114#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005115 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005116#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005117 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005118#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005119 if (ch > 0xFF) {
5120#if SIZEOF_WCHAR_T == 4
5121 assert(0);
5122#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005123 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 /* compute and append the two surrogates: */
5125 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5126 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5127#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005128 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005129 else {
5130 if (!ch && s == e)
5131 break;
5132 /* surrogateescape */
5133 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5134 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005135 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005136 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005137 return unicode;
5138}
5139
Xavier de Gaye76febd02016-12-15 20:59:58 +01005140#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005142/* Primary internal function which creates utf8 encoded bytes objects.
5143
5144 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005145 and allocate exactly as much space needed at the end. Else allocate the
5146 maximum possible needed (4 result bytes per Unicode character), and return
5147 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005148*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005149PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005150_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151{
Victor Stinner6099a032011-12-18 14:22:26 +01005152 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153 void *data;
5154 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005156 if (!PyUnicode_Check(unicode)) {
5157 PyErr_BadArgument();
5158 return NULL;
5159 }
5160
5161 if (PyUnicode_READY(unicode) == -1)
5162 return NULL;
5163
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005164 if (PyUnicode_UTF8(unicode))
5165 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5166 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005167
5168 kind = PyUnicode_KIND(unicode);
5169 data = PyUnicode_DATA(unicode);
5170 size = PyUnicode_GET_LENGTH(unicode);
5171
Benjamin Petersonead6b532011-12-20 17:23:42 -06005172 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005173 default:
5174 assert(0);
5175 case PyUnicode_1BYTE_KIND:
5176 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5177 assert(!PyUnicode_IS_ASCII(unicode));
5178 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5179 case PyUnicode_2BYTE_KIND:
5180 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5181 case PyUnicode_4BYTE_KIND:
5182 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184}
5185
Alexander Belopolsky40018472011-02-26 01:02:56 +00005186PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005187PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5188 Py_ssize_t size,
5189 const char *errors)
5190{
5191 PyObject *v, *unicode;
5192
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005193 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005194 if (unicode == NULL)
5195 return NULL;
5196 v = _PyUnicode_AsUTF8String(unicode, errors);
5197 Py_DECREF(unicode);
5198 return v;
5199}
5200
5201PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005202PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005204 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205}
5206
Walter Dörwald41980ca2007-08-16 21:55:45 +00005207/* --- UTF-32 Codec ------------------------------------------------------- */
5208
5209PyObject *
5210PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 Py_ssize_t size,
5212 const char *errors,
5213 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005214{
5215 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5216}
5217
5218PyObject *
5219PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 Py_ssize_t size,
5221 const char *errors,
5222 int *byteorder,
5223 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224{
5225 const char *starts = s;
5226 Py_ssize_t startinpos;
5227 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005228 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005229 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005230 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005231 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 PyObject *errorHandler = NULL;
5234 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005235
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236 q = (unsigned char *)s;
5237 e = q + size;
5238
5239 if (byteorder)
5240 bo = *byteorder;
5241
5242 /* Check for BOM marks (U+FEFF) in the input and adjust current
5243 byte order setting accordingly. In native mode, the leading BOM
5244 mark is skipped, in all other modes, it is copied to the output
5245 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005247 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 if (bom == 0x0000FEFF) {
5249 bo = -1;
5250 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005252 else if (bom == 0xFFFE0000) {
5253 bo = 1;
5254 q += 4;
5255 }
5256 if (byteorder)
5257 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005258 }
5259
Victor Stinnere64322e2012-10-30 23:12:47 +01005260 if (q == e) {
5261 if (consumed)
5262 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005263 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264 }
5265
Victor Stinnere64322e2012-10-30 23:12:47 +01005266#ifdef WORDS_BIGENDIAN
5267 le = bo < 0;
5268#else
5269 le = bo <= 0;
5270#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005271 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005272
Victor Stinner8f674cc2013-04-17 23:02:17 +02005273 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005274 writer.min_length = (e - q + 3) / 4;
5275 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005276 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005277
Victor Stinnere64322e2012-10-30 23:12:47 +01005278 while (1) {
5279 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005280 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005281
Victor Stinnere64322e2012-10-30 23:12:47 +01005282 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005283 enum PyUnicode_Kind kind = writer.kind;
5284 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005285 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005286 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005287 if (le) {
5288 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005289 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005290 if (ch > maxch)
5291 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005292 if (kind != PyUnicode_1BYTE_KIND &&
5293 Py_UNICODE_IS_SURROGATE(ch))
5294 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 q += 4;
5297 } while (q <= last);
5298 }
5299 else {
5300 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005301 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 if (ch > maxch)
5303 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 if (kind != PyUnicode_1BYTE_KIND &&
5305 Py_UNICODE_IS_SURROGATE(ch))
5306 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 q += 4;
5309 } while (q <= last);
5310 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005311 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005312 }
5313
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005314 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005315 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005316 startinpos = ((const char *)q) - starts;
5317 endinpos = startinpos + 4;
5318 }
5319 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005322 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 startinpos = ((const char *)q) - starts;
5325 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005327 else {
5328 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005329 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005330 goto onError;
5331 q += 4;
5332 continue;
5333 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005334 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 startinpos = ((const char *)q) - starts;
5336 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005338
5339 /* The remaining input chars are ignored if the callback
5340 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005341 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005345 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005347 }
5348
Walter Dörwald41980ca2007-08-16 21:55:45 +00005349 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005351
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352 Py_XDECREF(errorHandler);
5353 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005358 Py_XDECREF(errorHandler);
5359 Py_XDECREF(exc);
5360 return NULL;
5361}
5362
5363PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005364_PyUnicode_EncodeUTF32(PyObject *str,
5365 const char *errors,
5366 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005368 enum PyUnicode_Kind kind;
5369 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005370 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005371 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005372 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005373#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005374 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005376 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005378 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005379 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005380 PyObject *errorHandler = NULL;
5381 PyObject *exc = NULL;
5382 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005383
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005384 if (!PyUnicode_Check(str)) {
5385 PyErr_BadArgument();
5386 return NULL;
5387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005388 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 return NULL;
5390 kind = PyUnicode_KIND(str);
5391 data = PyUnicode_DATA(str);
5392 len = PyUnicode_GET_LENGTH(str);
5393
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005394 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005395 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005396 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005397 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005398 if (v == NULL)
5399 return NULL;
5400
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401 /* output buffer is 4-bytes aligned */
5402 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005403 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005404 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005406 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005408
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005410 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005411 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005412 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005413 else
5414 encoding = "utf-32";
5415
5416 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5418 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419 }
5420
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 pos = 0;
5422 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424
5425 if (kind == PyUnicode_2BYTE_KIND) {
5426 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5427 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005428 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 else {
5430 assert(kind == PyUnicode_4BYTE_KIND);
5431 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5432 &out, native_ordering);
5433 }
5434 if (pos == len)
5435 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005436
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005437 rep = unicode_encode_call_errorhandler(
5438 errors, &errorHandler,
5439 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005441 if (!rep)
5442 goto error;
5443
5444 if (PyBytes_Check(rep)) {
5445 repsize = PyBytes_GET_SIZE(rep);
5446 if (repsize & 3) {
5447 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 "surrogates not allowed");
5450 goto error;
5451 }
5452 moreunits = repsize / 4;
5453 }
5454 else {
5455 assert(PyUnicode_Check(rep));
5456 if (PyUnicode_READY(rep) < 0)
5457 goto error;
5458 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5459 if (!PyUnicode_IS_ASCII(rep)) {
5460 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 "surrogates not allowed");
5463 goto error;
5464 }
5465 }
5466
5467 /* four bytes are reserved for each surrogate */
5468 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005469 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005470 Py_ssize_t morebytes = 4 * (moreunits - 1);
5471 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5472 /* integer overflow */
5473 PyErr_NoMemory();
5474 goto error;
5475 }
5476 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5477 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005478 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 }
5480
5481 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005482 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005483 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005484 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005486 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5487 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005488 }
5489
5490 Py_CLEAR(rep);
5491 }
5492
5493 /* Cut back to size actually needed. This is necessary for, for example,
5494 encoding of a string containing isolated surrogates and the 'ignore'
5495 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005496 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 if (nsize != PyBytes_GET_SIZE(v))
5498 _PyBytes_Resize(&v, nsize);
5499 Py_XDECREF(errorHandler);
5500 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005502 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 error:
5504 Py_XDECREF(rep);
5505 Py_XDECREF(errorHandler);
5506 Py_XDECREF(exc);
5507 Py_XDECREF(v);
5508 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005509}
5510
Alexander Belopolsky40018472011-02-26 01:02:56 +00005511PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005512PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5513 Py_ssize_t size,
5514 const char *errors,
5515 int byteorder)
5516{
5517 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005518 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005519 if (tmp == NULL)
5520 return NULL;
5521 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5522 Py_DECREF(tmp);
5523 return result;
5524}
5525
5526PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005527PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005528{
Victor Stinnerb960b342011-11-20 19:12:52 +01005529 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005530}
5531
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532/* --- UTF-16 Codec ------------------------------------------------------- */
5533
Tim Peters772747b2001-08-09 22:21:55 +00005534PyObject *
5535PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 Py_ssize_t size,
5537 const char *errors,
5538 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539{
Walter Dörwald69652032004-09-07 20:24:22 +00005540 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5541}
5542
5543PyObject *
5544PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 Py_ssize_t size,
5546 const char *errors,
5547 int *byteorder,
5548 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005549{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005551 Py_ssize_t startinpos;
5552 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005553 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005554 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005555 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005556 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005557 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558 PyObject *errorHandler = NULL;
5559 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005560 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Tim Peters772747b2001-08-09 22:21:55 +00005562 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005563 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
5565 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005566 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005568 /* Check for BOM marks (U+FEFF) in the input and adjust current
5569 byte order setting accordingly. In native mode, the leading BOM
5570 mark is skipped, in all other modes, it is copied to the output
5571 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 if (bo == 0 && size >= 2) {
5573 const Py_UCS4 bom = (q[1] << 8) | q[0];
5574 if (bom == 0xFEFF) {
5575 q += 2;
5576 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005578 else if (bom == 0xFFFE) {
5579 q += 2;
5580 bo = 1;
5581 }
5582 if (byteorder)
5583 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Antoine Pitrou63065d72012-05-15 23:48:04 +02005586 if (q == e) {
5587 if (consumed)
5588 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005589 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005590 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005591
Christian Heimes743e0cd2012-10-17 23:52:17 +02005592#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005594 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005595#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005597 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005598#endif
Tim Peters772747b2001-08-09 22:21:55 +00005599
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600 /* Note: size will always be longer than the resulting Unicode
5601 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005602 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005603 writer.min_length = (e - q + 1) / 2;
5604 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005605 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005606
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 while (1) {
5608 Py_UCS4 ch = 0;
5609 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005610 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005612 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005613 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005614 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 native_ordering);
5616 else
5617 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005619 native_ordering);
5620 } else if (kind == PyUnicode_2BYTE_KIND) {
5621 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005622 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005623 native_ordering);
5624 } else {
5625 assert(kind == PyUnicode_4BYTE_KIND);
5626 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005629 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005630 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 switch (ch)
5633 {
5634 case 0:
5635 /* remaining byte at the end? (size should be even) */
5636 if (q == e || consumed)
5637 goto End;
5638 errmsg = "truncated data";
5639 startinpos = ((const char *)q) - starts;
5640 endinpos = ((const char *)e) - starts;
5641 break;
5642 /* The remaining input chars are ignored if the callback
5643 chooses to skip the input */
5644 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005645 q -= 2;
5646 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005647 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005648 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005649 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005650 endinpos = ((const char *)e) - starts;
5651 break;
5652 case 2:
5653 errmsg = "illegal encoding";
5654 startinpos = ((const char *)q) - 2 - starts;
5655 endinpos = startinpos + 2;
5656 break;
5657 case 3:
5658 errmsg = "illegal UTF-16 surrogate";
5659 startinpos = ((const char *)q) - 4 - starts;
5660 endinpos = startinpos + 2;
5661 break;
5662 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005663 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005664 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 continue;
5666 }
5667
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005668 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005669 errors,
5670 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005672 &starts,
5673 (const char **)&e,
5674 &startinpos,
5675 &endinpos,
5676 &exc,
5677 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 }
5681
Antoine Pitrou63065d72012-05-15 23:48:04 +02005682End:
Walter Dörwald69652032004-09-07 20:24:22 +00005683 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 Py_XDECREF(errorHandler);
5687 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 Py_XDECREF(errorHandler);
5693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 return NULL;
5695}
5696
Tim Peters772747b2001-08-09 22:21:55 +00005697PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005698_PyUnicode_EncodeUTF16(PyObject *str,
5699 const char *errors,
5700 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005702 enum PyUnicode_Kind kind;
5703 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005704 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005705 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005706 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005707 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005708#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005709 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005710#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005711 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005712#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 const char *encoding;
5714 Py_ssize_t nsize, pos;
5715 PyObject *errorHandler = NULL;
5716 PyObject *exc = NULL;
5717 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005718
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719 if (!PyUnicode_Check(str)) {
5720 PyErr_BadArgument();
5721 return NULL;
5722 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005723 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005724 return NULL;
5725 kind = PyUnicode_KIND(str);
5726 data = PyUnicode_DATA(str);
5727 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005728
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 if (kind == PyUnicode_4BYTE_KIND) {
5731 const Py_UCS4 *in = (const Py_UCS4 *)data;
5732 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005733 while (in < end) {
5734 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005736 }
5737 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005738 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005739 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005741 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005742 nsize = len + pairs + (byteorder == 0);
5743 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005744 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005748 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005749 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005750 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005752 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 }
5754 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005755 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 }
Tim Peters772747b2001-08-09 22:21:55 +00005757
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 if (kind == PyUnicode_1BYTE_KIND) {
5759 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5760 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005761 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005762
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005763 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005764 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
5766 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005768 }
5769 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005770 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005771 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005772
5773 pos = 0;
5774 while (pos < len) {
5775 Py_ssize_t repsize, moreunits;
5776
5777 if (kind == PyUnicode_2BYTE_KIND) {
5778 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5779 &out, native_ordering);
5780 }
5781 else {
5782 assert(kind == PyUnicode_4BYTE_KIND);
5783 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5784 &out, native_ordering);
5785 }
5786 if (pos == len)
5787 break;
5788
5789 rep = unicode_encode_call_errorhandler(
5790 errors, &errorHandler,
5791 encoding, "surrogates not allowed",
5792 str, &exc, pos, pos + 1, &pos);
5793 if (!rep)
5794 goto error;
5795
5796 if (PyBytes_Check(rep)) {
5797 repsize = PyBytes_GET_SIZE(rep);
5798 if (repsize & 1) {
5799 raise_encode_exception(&exc, encoding,
5800 str, pos - 1, pos,
5801 "surrogates not allowed");
5802 goto error;
5803 }
5804 moreunits = repsize / 2;
5805 }
5806 else {
5807 assert(PyUnicode_Check(rep));
5808 if (PyUnicode_READY(rep) < 0)
5809 goto error;
5810 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5811 if (!PyUnicode_IS_ASCII(rep)) {
5812 raise_encode_exception(&exc, encoding,
5813 str, pos - 1, pos,
5814 "surrogates not allowed");
5815 goto error;
5816 }
5817 }
5818
5819 /* two bytes are reserved for each surrogate */
5820 if (moreunits > 1) {
5821 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5822 Py_ssize_t morebytes = 2 * (moreunits - 1);
5823 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5824 /* integer overflow */
5825 PyErr_NoMemory();
5826 goto error;
5827 }
5828 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5829 goto error;
5830 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5831 }
5832
5833 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005834 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005835 out += moreunits;
5836 } else /* rep is unicode */ {
5837 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5838 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5839 &out, native_ordering);
5840 }
5841
5842 Py_CLEAR(rep);
5843 }
5844
5845 /* Cut back to size actually needed. This is necessary for, for example,
5846 encoding of a string containing isolated surrogates and the 'ignore' handler
5847 is used. */
5848 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5849 if (nsize != PyBytes_GET_SIZE(v))
5850 _PyBytes_Resize(&v, nsize);
5851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005853 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005854 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005855 error:
5856 Py_XDECREF(rep);
5857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
5859 Py_XDECREF(v);
5860 return NULL;
5861#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862}
5863
Alexander Belopolsky40018472011-02-26 01:02:56 +00005864PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5866 Py_ssize_t size,
5867 const char *errors,
5868 int byteorder)
5869{
5870 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005871 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005872 if (tmp == NULL)
5873 return NULL;
5874 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5875 Py_DECREF(tmp);
5876 return result;
5877}
5878
5879PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005880PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883}
5884
/* --- Unicode Escape Codec ----------------------------------------------- */
5886
Fredrik Lundh06d12682001-01-24 07:59:11 +00005887static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005888
Alexander Belopolsky40018472011-02-26 01:02:56 +00005889PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005890_PyUnicode_DecodeUnicodeEscape(const char *s,
5891 Py_ssize_t size,
5892 const char *errors,
5893 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 PyObject *errorHandler = NULL;
5899 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005900
Eric V. Smith42454af2016-10-31 09:22:08 -04005901 // so we can remember if we've seen an invalid escape char or not
5902 *first_invalid_escape = NULL;
5903
Victor Stinner62ec3312016-09-06 17:04:34 -07005904 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005905 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005906 }
5907 /* Escaped strings will always be longer than the resulting
5908 Unicode string, so we start with size here and then reduce the
5909 length after conversion to the true value.
5910 (but if the error callback returns a long replacement string
5911 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005912 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005913 writer.min_length = size;
5914 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5915 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005916 }
5917
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 end = s + size;
5919 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005920 unsigned char c = (unsigned char) *s++;
5921 Py_UCS4 ch;
5922 int count;
5923 Py_ssize_t startinpos;
5924 Py_ssize_t endinpos;
5925 const char *message;
5926
5927#define WRITE_ASCII_CHAR(ch) \
5928 do { \
5929 assert(ch <= 127); \
5930 assert(writer.pos < writer.size); \
5931 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5932 } while(0)
5933
5934#define WRITE_CHAR(ch) \
5935 do { \
5936 if (ch <= writer.maxchar) { \
5937 assert(writer.pos < writer.size); \
5938 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5939 } \
5940 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5941 goto onError; \
5942 } \
5943 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944
5945 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005946 if (c != '\\') {
5947 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 continue;
5949 }
5950
Victor Stinner62ec3312016-09-06 17:04:34 -07005951 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005953 if (s >= end) {
5954 message = "\\ at end of string";
5955 goto error;
5956 }
5957 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005958
Victor Stinner62ec3312016-09-06 17:04:34 -07005959 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005960 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005963 case '\n': continue;
5964 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5965 case '\'': WRITE_ASCII_CHAR('\''); continue;
5966 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5967 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5970 case 't': WRITE_ASCII_CHAR('\t'); continue;
5971 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5972 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005973 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005974 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005975 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005976 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 case '0': case '1': case '2': case '3':
5980 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005982 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005983 ch = (ch<<3) + *s++ - '0';
5984 if (s < end && '0' <= *s && *s <= '7') {
5985 ch = (ch<<3) + *s++ - '0';
5986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005988 WRITE_CHAR(ch);
5989 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 /* hex escapes */
5992 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005995 message = "truncated \\xXX escape";
5996 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006000 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006001 message = "truncated \\uXXXX escape";
6002 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006005 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 message = "truncated \\UXXXXXXXX escape";
6008 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006009 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006010 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 ch <<= 4;
6012 if (c >= '0' && c <= '9') {
6013 ch += c - '0';
6014 }
6015 else if (c >= 'a' && c <= 'f') {
6016 ch += c - ('a' - 10);
6017 }
6018 else if (c >= 'A' && c <= 'F') {
6019 ch += c - ('A' - 10);
6020 }
6021 else {
6022 break;
6023 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006024 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006025 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006026 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006027 }
6028
6029 /* when we get here, ch is a 32-bit unicode character */
6030 if (ch > MAX_UNICODE) {
6031 message = "illegal Unicode character";
6032 goto error;
6033 }
6034
6035 WRITE_CHAR(ch);
6036 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006037
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006039 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006040 if (ucnhash_CAPI == NULL) {
6041 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006042 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6043 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006044 if (ucnhash_CAPI == NULL) {
6045 PyErr_SetString(
6046 PyExc_UnicodeError,
6047 "\\N escapes not supported (can't load unicodedata module)"
6048 );
6049 goto onError;
6050 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006051 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006052
6053 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006055 const char *start = ++s;
6056 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006058 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006060 namelen = s - start;
6061 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006064 ch = 0xffffffff; /* in case 'getcode' messes up */
6065 if (namelen <= INT_MAX &&
6066 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6067 &ch, 0)) {
6068 assert(ch <= MAX_UNICODE);
6069 WRITE_CHAR(ch);
6070 continue;
6071 }
6072 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 }
6074 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006075 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076
6077 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006078 if (*first_invalid_escape == NULL) {
6079 *first_invalid_escape = s-1; /* Back up one char, since we've
6080 already incremented s. */
6081 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 WRITE_ASCII_CHAR('\\');
6083 WRITE_CHAR(c);
6084 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006086
6087 error:
6088 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006089 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006090 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006091 errors, &errorHandler,
6092 "unicodeescape", message,
6093 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006094 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006095 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006096 }
6097 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6098 goto onError;
6099 }
6100
6101#undef WRITE_ASCII_CHAR
6102#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006104
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006105 Py_XDECREF(errorHandler);
6106 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006107 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006108
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006110 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 Py_XDECREF(errorHandler);
6112 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 return NULL;
6114}
6115
Eric V. Smith42454af2016-10-31 09:22:08 -04006116PyObject *
6117PyUnicode_DecodeUnicodeEscape(const char *s,
6118 Py_ssize_t size,
6119 const char *errors)
6120{
6121 const char *first_invalid_escape;
6122 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6123 &first_invalid_escape);
6124 if (result == NULL)
6125 return NULL;
6126 if (first_invalid_escape != NULL) {
6127 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6128 "invalid escape sequence '\\%c'",
6129 *first_invalid_escape) < 0) {
6130 Py_DECREF(result);
6131 return NULL;
6132 }
6133 }
6134 return result;
6135}
6136
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006137/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Alexander Belopolsky40018472011-02-26 01:02:56 +00006139PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006143 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006145 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Ezio Melottie7f90372012-10-05 03:33:31 +03006149 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006150 escape.
6151
Ezio Melottie7f90372012-10-05 03:33:31 +03006152 For UCS1 strings it's '\xxx', 4 bytes per source character.
6153 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6154 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006155 */
6156
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157 if (!PyUnicode_Check(unicode)) {
6158 PyErr_BadArgument();
6159 return NULL;
6160 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 }
Victor Stinner358af132015-10-12 22:36:57 +02006164
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006166 if (len == 0) {
6167 return PyBytes_FromStringAndSize(NULL, 0);
6168 }
6169
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6173 bytes, and 1 byte characters 4. */
6174 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006175 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 return PyErr_NoMemory();
6177 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006178 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 if (repr == NULL) {
6180 return NULL;
6181 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006185 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006186
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 /* U+0000-U+00ff range */
6188 if (ch < 0x100) {
6189 if (ch >= ' ' && ch < 127) {
6190 if (ch != '\\') {
6191 /* Copy printable US ASCII as-is */
6192 *p++ = (char) ch;
6193 }
6194 /* Escape backslashes */
6195 else {
6196 *p++ = '\\';
6197 *p++ = '\\';
6198 }
6199 }
Victor Stinner358af132015-10-12 22:36:57 +02006200
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 /* Map special whitespace to '\t', \n', '\r' */
6202 else if (ch == '\t') {
6203 *p++ = '\\';
6204 *p++ = 't';
6205 }
6206 else if (ch == '\n') {
6207 *p++ = '\\';
6208 *p++ = 'n';
6209 }
6210 else if (ch == '\r') {
6211 *p++ = '\\';
6212 *p++ = 'r';
6213 }
6214
6215 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6216 else {
6217 *p++ = '\\';
6218 *p++ = 'x';
6219 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6220 *p++ = Py_hexdigits[ch & 0x000F];
6221 }
Tim Petersced69f82003-09-16 20:30:58 +00006222 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006223 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 *p++ = '\\';
6226 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006227 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6228 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6233 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006234
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 /* Make sure that the first two digits are zero */
6236 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006237 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006238 *p++ = 'U';
6239 *p++ = '0';
6240 *p++ = '0';
6241 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6242 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6243 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6244 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6245 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6246 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 assert(p - PyBytes_AS_STRING(repr) > 0);
6251 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6252 return NULL;
6253 }
6254 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255}
6256
Alexander Belopolsky40018472011-02-26 01:02:56 +00006257PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006258PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6259 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006261 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006262 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 }
6266
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006267 result = PyUnicode_AsUnicodeEscapeString(tmp);
6268 Py_DECREF(tmp);
6269 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270}
6271
6272/* --- Raw Unicode Escape Codec ------------------------------------------- */
6273
Alexander Belopolsky40018472011-02-26 01:02:56 +00006274PyObject *
6275PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006276 Py_ssize_t size,
6277 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006280 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006282 PyObject *errorHandler = NULL;
6283 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006284
Victor Stinner62ec3312016-09-06 17:04:34 -07006285 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006286 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006287 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 /* Escaped strings will always be longer than the resulting
6290 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 length after conversion to the true value. (But decoding error
6292 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006293 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 writer.min_length = size;
6295 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6296 goto onError;
6297 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006298
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 end = s + size;
6300 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 unsigned char c = (unsigned char) *s++;
6302 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006303 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 Py_ssize_t startinpos;
6305 Py_ssize_t endinpos;
6306 const char *message;
6307
6308#define WRITE_CHAR(ch) \
6309 do { \
6310 if (ch <= writer.maxchar) { \
6311 assert(writer.pos < writer.size); \
6312 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6313 } \
6314 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6315 goto onError; \
6316 } \
6317 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 if (c != '\\' || s >= end) {
6321 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006323 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006324
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 c = (unsigned char) *s++;
6326 if (c == 'u') {
6327 count = 4;
6328 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 else if (c == 'U') {
6331 count = 8;
6332 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006333 }
6334 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 assert(writer.pos < writer.size);
6336 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6337 WRITE_CHAR(c);
6338 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006339 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 startinpos = s - starts - 2;
6341
6342 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6343 for (ch = 0; count && s < end; ++s, --count) {
6344 c = (unsigned char)*s;
6345 ch <<= 4;
6346 if (c >= '0' && c <= '9') {
6347 ch += c - '0';
6348 }
6349 else if (c >= 'a' && c <= 'f') {
6350 ch += c - ('a' - 10);
6351 }
6352 else if (c >= 'A' && c <= 'F') {
6353 ch += c - ('A' - 10);
6354 }
6355 else {
6356 break;
6357 }
6358 }
6359 if (!count) {
6360 if (ch <= MAX_UNICODE) {
6361 WRITE_CHAR(ch);
6362 continue;
6363 }
6364 message = "\\Uxxxxxxxx out of range";
6365 }
6366
6367 endinpos = s-starts;
6368 writer.min_length = end - s + writer.pos;
6369 if (unicode_decode_call_errorhandler_writer(
6370 errors, &errorHandler,
6371 "rawunicodeescape", message,
6372 &starts, &end, &startinpos, &endinpos, &exc, &s,
6373 &writer)) {
6374 goto onError;
6375 }
6376 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6377 goto onError;
6378 }
6379
6380#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382 Py_XDECREF(errorHandler);
6383 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006384 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006385
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006387 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388 Py_XDECREF(errorHandler);
6389 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006391
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392}
6393
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006394
Alexander Belopolsky40018472011-02-26 01:02:56 +00006395PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006396PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Victor Stinner62ec3312016-09-06 17:04:34 -07006398 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401 int kind;
6402 void *data;
6403 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006405 if (!PyUnicode_Check(unicode)) {
6406 PyErr_BadArgument();
6407 return NULL;
6408 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412 kind = PyUnicode_KIND(unicode);
6413 data = PyUnicode_DATA(unicode);
6414 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 if (kind == PyUnicode_1BYTE_KIND) {
6416 return PyBytes_FromStringAndSize(data, len);
6417 }
Victor Stinner0e368262011-11-10 20:12:49 +01006418
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6420 bytes, and 1 byte characters 4. */
6421 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006422
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (len > PY_SSIZE_T_MAX / expandsize) {
6424 return PyErr_NoMemory();
6425 }
6426 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6427 if (repr == NULL) {
6428 return NULL;
6429 }
6430 if (len == 0) {
6431 return repr;
6432 }
6433
6434 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006435 for (pos = 0; pos < len; pos++) {
6436 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006437
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6439 if (ch < 0x100) {
6440 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006441 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006442 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6443 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 *p++ = '\\';
6445 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006446 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6447 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6448 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6449 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6452 else {
6453 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6454 *p++ = '\\';
6455 *p++ = 'U';
6456 *p++ = '0';
6457 *p++ = '0';
6458 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6459 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6460 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463 *p++ = Py_hexdigits[ch & 15];
6464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006466
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 assert(p > PyBytes_AS_STRING(repr));
6468 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6469 return NULL;
6470 }
6471 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472}
6473
Alexander Belopolsky40018472011-02-26 01:02:56 +00006474PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006475PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006478 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006479 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006480 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006481 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006482 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6483 Py_DECREF(tmp);
6484 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485}
6486
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006487/* --- Unicode Internal Codec ------------------------------------------- */
6488
Alexander Belopolsky40018472011-02-26 01:02:56 +00006489PyObject *
6490_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006491 Py_ssize_t size,
6492 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006493{
6494 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 Py_ssize_t startinpos;
6496 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006497 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006498 const char *end;
6499 const char *reason;
6500 PyObject *errorHandler = NULL;
6501 PyObject *exc = NULL;
6502
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006503 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006504 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006505 1))
6506 return NULL;
6507
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006508 if (size == 0)
6509 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006510
Victor Stinner8f674cc2013-04-17 23:02:17 +02006511 _PyUnicodeWriter_Init(&writer);
6512 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6513 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006515 }
6516 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517
Victor Stinner8f674cc2013-04-17 23:02:17 +02006518 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006519 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006520 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006521 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006522 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006523 endinpos = end-starts;
6524 reason = "truncated input";
6525 goto error;
6526 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006527 /* We copy the raw representation one byte at a time because the
6528 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006529 ((char *) &uch)[0] = s[0];
6530 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006531#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006532 ((char *) &uch)[2] = s[2];
6533 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006534#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006535 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006536#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 /* We have to sanity check the raw data, otherwise doom looms for
6538 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006539 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006540 endinpos = s - starts + Py_UNICODE_SIZE;
6541 reason = "illegal code point (> 0x10FFFF)";
6542 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006543 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006544#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006545 s += Py_UNICODE_SIZE;
6546#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006547 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006548 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006549 Py_UNICODE uch2;
6550 ((char *) &uch2)[0] = s[0];
6551 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006552 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006553 {
Victor Stinner551ac952011-11-29 22:58:13 +01006554 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006555 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006556 }
6557 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006558#endif
6559
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006560 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006561 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006562 continue;
6563
6564 error:
6565 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006566 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006567 errors, &errorHandler,
6568 "unicode_internal", reason,
6569 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006570 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006571 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006572 }
6573
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006574 Py_XDECREF(errorHandler);
6575 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006576 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006577
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006579 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006580 Py_XDECREF(errorHandler);
6581 Py_XDECREF(exc);
6582 return NULL;
6583}
6584
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585/* --- Latin-1 Codec ------------------------------------------------------ */
6586
Alexander Belopolsky40018472011-02-26 01:02:56 +00006587PyObject *
6588PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006589 Py_ssize_t size,
6590 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006593 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594}
6595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597static void
6598make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006599 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006600 PyObject *unicode,
6601 Py_ssize_t startpos, Py_ssize_t endpos,
6602 const char *reason)
6603{
6604 if (*exceptionObject == NULL) {
6605 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006607 encoding, unicode, startpos, endpos, reason);
6608 }
6609 else {
6610 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6611 goto onError;
6612 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6613 goto onError;
6614 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6615 goto onError;
6616 return;
6617 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006618 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006619 }
6620}
6621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623static void
6624raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006625 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006626 PyObject *unicode,
6627 Py_ssize_t startpos, Py_ssize_t endpos,
6628 const char *reason)
6629{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006630 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006631 encoding, unicode, startpos, endpos, reason);
6632 if (*exceptionObject != NULL)
6633 PyCodec_StrictErrors(*exceptionObject);
6634}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635
6636/* error handling callback helper:
6637 build arguments, call the callback and check the arguments,
6638 put the result into newpos and return the replacement string, which
6639 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640static PyObject *
6641unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006642 PyObject **errorHandler,
6643 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006645 Py_ssize_t startpos, Py_ssize_t endpos,
6646 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006648 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 PyObject *restuple;
6651 PyObject *resunicode;
6652
6653 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 }
6658
Benjamin Petersonbac79492012-01-14 13:34:47 -05006659 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006660 return NULL;
6661 len = PyUnicode_GET_LENGTH(unicode);
6662
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006663 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006668 restuple = PyObject_CallFunctionObjArgs(
6669 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006673 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 Py_DECREF(restuple);
6675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006677 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 &resunicode, newpos)) {
6679 Py_DECREF(restuple);
6680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006682 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6683 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6684 Py_DECREF(restuple);
6685 return NULL;
6686 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006688 *newpos = len + *newpos;
6689 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006690 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 Py_DECREF(restuple);
6692 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006693 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 Py_INCREF(resunicode);
6695 Py_DECREF(restuple);
6696 return resunicode;
6697}
6698
Alexander Belopolsky40018472011-02-26 01:02:56 +00006699static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006700unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006701 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006702 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006704 /* input state */
6705 Py_ssize_t pos=0, size;
6706 int kind;
6707 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 /* pointer into the output */
6709 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006710 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6711 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006712 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006714 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006715 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006716 /* output object */
6717 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718
Benjamin Petersonbac79492012-01-14 13:34:47 -05006719 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 return NULL;
6721 size = PyUnicode_GET_LENGTH(unicode);
6722 kind = PyUnicode_KIND(unicode);
6723 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 /* allocate enough for a simple encoding without
6725 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006726 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006727 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006728
6729 _PyBytesWriter_Init(&writer);
6730 str = _PyBytesWriter_Alloc(&writer, size);
6731 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006732 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006735 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006738 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006740 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006741 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006744 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006746 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006747 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006749
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006750 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006752
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006753 /* Only overallocate the buffer if it's not the last write */
6754 writer.overallocate = (collend < size);
6755
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006757 if (error_handler == _Py_ERROR_UNKNOWN)
6758 error_handler = get_error_handler(errors);
6759
6760 switch (error_handler) {
6761 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006762 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006764
6765 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006766 memset(str, '?', collend - collstart);
6767 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006768 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006769 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006770 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 break;
Victor Stinner50149202015-09-22 00:26:54 +02006772
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006773 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006774 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006775 writer.min_size -= (collend - collstart);
6776 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006777 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006778 if (str == NULL)
6779 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006780 pos = collend;
6781 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006782
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006783 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006784 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006785 writer.min_size -= (collend - collstart);
6786 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006787 unicode, collstart, collend);
6788 if (str == NULL)
6789 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006790 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 break;
Victor Stinner50149202015-09-22 00:26:54 +02006792
Victor Stinnerc3713e92015-09-29 12:32:13 +02006793 case _Py_ERROR_SURROGATEESCAPE:
6794 for (i = collstart; i < collend; ++i) {
6795 ch = PyUnicode_READ(kind, data, i);
6796 if (ch < 0xdc80 || 0xdcff < ch) {
6797 /* Not a UTF-8b surrogate */
6798 break;
6799 }
6800 *str++ = (char)(ch - 0xdc00);
6801 ++pos;
6802 }
6803 if (i >= collend)
6804 break;
6805 collstart = pos;
6806 assert(collstart != collend);
6807 /* fallback to general error handling */
6808
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006810 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6811 encoding, reason, unicode, &exc,
6812 collstart, collend, &newpos);
6813 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006815
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006816 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006817 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006818
Victor Stinner6bd525b2015-10-09 13:10:05 +02006819 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006820 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006821 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006822 PyBytes_AS_STRING(rep),
6823 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006824 if (str == NULL)
6825 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006826 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006827 else {
6828 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006829
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006832
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006833 if (limit == 256 ?
6834 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6835 !PyUnicode_IS_ASCII(rep))
6836 {
6837 /* Not all characters are smaller than limit */
6838 raise_encode_exception(&exc, encoding, unicode,
6839 collstart, collend, reason);
6840 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006842 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6843 str = _PyBytesWriter_WriteBytes(&writer, str,
6844 PyUnicode_DATA(rep),
6845 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006847 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006849 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006850
6851 /* If overallocation was disabled, ensure that it was the last
6852 write. Otherwise, we missed an optimization */
6853 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006854 }
6855 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006856
Victor Stinner50149202015-09-22 00:26:54 +02006857 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006859 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006860
6861 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006862 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006863 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006864 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006865 Py_XDECREF(exc);
6866 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006867}
6868
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006869/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006870PyObject *
6871PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006872 Py_ssize_t size,
6873 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006875 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006876 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006877 if (unicode == NULL)
6878 return NULL;
6879 result = unicode_encode_ucs1(unicode, errors, 256);
6880 Py_DECREF(unicode);
6881 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882}
6883
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006885_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
6887 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 PyErr_BadArgument();
6889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006891 if (PyUnicode_READY(unicode) == -1)
6892 return NULL;
6893 /* Fast path: if it is a one-byte string, construct
6894 bytes object directly. */
6895 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6896 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6897 PyUnicode_GET_LENGTH(unicode));
6898 /* Non-Latin-1 characters present. Defer to above function to
6899 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006900 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006901}
6902
6903PyObject*
6904PyUnicode_AsLatin1String(PyObject *unicode)
6905{
6906 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907}
6908
6909/* --- 7-bit ASCII Codec -------------------------------------------------- */
6910
Alexander Belopolsky40018472011-02-26 01:02:56 +00006911PyObject *
6912PyUnicode_DecodeASCII(const char *s,
6913 Py_ssize_t size,
6914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006917 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006918 int kind;
6919 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006920 Py_ssize_t startinpos;
6921 Py_ssize_t endinpos;
6922 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006924 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006925 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006926 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006927
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006929 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006930
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006932 if (size == 1 && (unsigned char)s[0] < 128)
6933 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006934
Victor Stinner8f674cc2013-04-17 23:02:17 +02006935 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006936 writer.min_length = size;
6937 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006938 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006941 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006942 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006943 writer.pos = outpos;
6944 if (writer.pos == size)
6945 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006946
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006947 s += writer.pos;
6948 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006949 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006950 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006952 PyUnicode_WRITE(kind, data, writer.pos, c);
6953 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006955 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006956 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006957
6958 /* byte outsize range 0x00..0x7f: call the error handler */
6959
6960 if (error_handler == _Py_ERROR_UNKNOWN)
6961 error_handler = get_error_handler(errors);
6962
6963 switch (error_handler)
6964 {
6965 case _Py_ERROR_REPLACE:
6966 case _Py_ERROR_SURROGATEESCAPE:
6967 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006968 but we may switch to UCS2 at the first write */
6969 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6970 goto onError;
6971 kind = writer.kind;
6972 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006973
6974 if (error_handler == _Py_ERROR_REPLACE)
6975 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6976 else
6977 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6978 writer.pos++;
6979 ++s;
6980 break;
6981
6982 case _Py_ERROR_IGNORE:
6983 ++s;
6984 break;
6985
6986 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 startinpos = s-starts;
6988 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006989 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006990 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 "ascii", "ordinal not in range(128)",
6992 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006993 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006995 kind = writer.kind;
6996 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006999 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007000 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007001 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007002
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007005 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 return NULL;
7008}
7009
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007010/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007011PyObject *
7012PyUnicode_EncodeASCII(const Py_UNICODE *p,
7013 Py_ssize_t size,
7014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007016 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007017 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007018 if (unicode == NULL)
7019 return NULL;
7020 result = unicode_encode_ucs1(unicode, errors, 128);
7021 Py_DECREF(unicode);
7022 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023}
7024
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007026_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
7028 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 PyErr_BadArgument();
7030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007032 if (PyUnicode_READY(unicode) == -1)
7033 return NULL;
7034 /* Fast path: if it is an ASCII-only string, construct bytes object
7035 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007036 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007037 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7038 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007039 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040}
7041
7042PyObject *
7043PyUnicode_AsASCIIString(PyObject *unicode)
7044{
7045 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046}
7047
Steve Dowercc16be82016-09-08 10:35:16 -07007048#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007049
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007050/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007051
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007052#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053#define NEED_RETRY
7054#endif
7055
Victor Stinner3a50e702011-10-18 21:21:00 +02007056#ifndef WC_ERR_INVALID_CHARS
7057# define WC_ERR_INVALID_CHARS 0x0080
7058#endif
7059
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007060static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007061code_page_name(UINT code_page, PyObject **obj)
7062{
7063 *obj = NULL;
7064 if (code_page == CP_ACP)
7065 return "mbcs";
7066 if (code_page == CP_UTF7)
7067 return "CP_UTF7";
7068 if (code_page == CP_UTF8)
7069 return "CP_UTF8";
7070
7071 *obj = PyBytes_FromFormat("cp%u", code_page);
7072 if (*obj == NULL)
7073 return NULL;
7074 return PyBytes_AS_STRING(*obj);
7075}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076
Victor Stinner3a50e702011-10-18 21:21:00 +02007077static DWORD
7078decode_code_page_flags(UINT code_page)
7079{
7080 if (code_page == CP_UTF7) {
7081 /* The CP_UTF7 decoder only supports flags=0 */
7082 return 0;
7083 }
7084 else
7085 return MB_ERR_INVALID_CHARS;
7086}
7087
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 * Decode a byte string from a Windows code page into unicode object in strict
7090 * mode.
7091 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007092 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7093 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007095static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007096decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007097 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 const char *in,
7099 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100{
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007102 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104
7105 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 assert(insize > 0);
7107 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7108 if (outsize <= 0)
7109 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110
7111 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007113 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007114 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 if (*v == NULL)
7116 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118 }
7119 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007122 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125 }
7126
7127 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7129 if (outsize <= 0)
7130 goto error;
7131 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007132
Victor Stinner3a50e702011-10-18 21:21:00 +02007133error:
7134 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7135 return -2;
7136 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007137 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138}
7139
Victor Stinner3a50e702011-10-18 21:21:00 +02007140/*
7141 * Decode a byte string from a code page into unicode object with an error
7142 * handler.
7143 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007144 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 * UnicodeDecodeError exception and returns -1 on error.
7146 */
7147static int
7148decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007149 PyObject **v,
7150 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007151 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007152{
7153 const char *startin = in;
7154 const char *endin = in + size;
7155 const DWORD flags = decode_code_page_flags(code_page);
7156 /* Ideally, we should get reason from FormatMessage. This is the Windows
7157 2000 English version of the message. */
7158 const char *reason = "No mapping for the Unicode character exists "
7159 "in the target code page.";
7160 /* each step cannot decode more than 1 character, but a character can be
7161 represented as a surrogate pair */
7162 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007163 int insize;
7164 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 PyObject *errorHandler = NULL;
7166 PyObject *exc = NULL;
7167 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007168 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 DWORD err;
7170 int ret = -1;
7171
7172 assert(size > 0);
7173
7174 encoding = code_page_name(code_page, &encoding_obj);
7175 if (encoding == NULL)
7176 return -1;
7177
Victor Stinner7d00cc12014-03-17 23:08:06 +01007178 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7180 UnicodeDecodeError. */
7181 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7182 if (exc != NULL) {
7183 PyCodec_StrictErrors(exc);
7184 Py_CLEAR(exc);
7185 }
7186 goto error;
7187 }
7188
7189 if (*v == NULL) {
7190 /* Create unicode object */
7191 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7192 PyErr_NoMemory();
7193 goto error;
7194 }
Victor Stinnerab595942011-12-17 04:59:06 +01007195 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007196 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 if (*v == NULL)
7198 goto error;
7199 startout = PyUnicode_AS_UNICODE(*v);
7200 }
7201 else {
7202 /* Extend unicode object */
7203 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7204 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7205 PyErr_NoMemory();
7206 goto error;
7207 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007208 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 goto error;
7210 startout = PyUnicode_AS_UNICODE(*v) + n;
7211 }
7212
7213 /* Decode the byte string character per character */
7214 out = startout;
7215 while (in < endin)
7216 {
7217 /* Decode a character */
7218 insize = 1;
7219 do
7220 {
7221 outsize = MultiByteToWideChar(code_page, flags,
7222 in, insize,
7223 buffer, Py_ARRAY_LENGTH(buffer));
7224 if (outsize > 0)
7225 break;
7226 err = GetLastError();
7227 if (err != ERROR_NO_UNICODE_TRANSLATION
7228 && err != ERROR_INSUFFICIENT_BUFFER)
7229 {
7230 PyErr_SetFromWindowsErr(0);
7231 goto error;
7232 }
7233 insize++;
7234 }
7235 /* 4=maximum length of a UTF-8 sequence */
7236 while (insize <= 4 && (in + insize) <= endin);
7237
7238 if (outsize <= 0) {
7239 Py_ssize_t startinpos, endinpos, outpos;
7240
Victor Stinner7d00cc12014-03-17 23:08:06 +01007241 /* last character in partial decode? */
7242 if (in + insize >= endin && !final)
7243 break;
7244
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 startinpos = in - startin;
7246 endinpos = startinpos + 1;
7247 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007248 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 errors, &errorHandler,
7250 encoding, reason,
7251 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007252 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 {
7254 goto error;
7255 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007256 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 }
7258 else {
7259 in += insize;
7260 memcpy(out, buffer, outsize * sizeof(wchar_t));
7261 out += outsize;
7262 }
7263 }
7264
7265 /* write a NUL character at the end */
7266 *out = 0;
7267
7268 /* Extend unicode object */
7269 outsize = out - startout;
7270 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007271 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007273 /* (in - startin) <= size and size is an int */
7274 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007275
7276error:
7277 Py_XDECREF(encoding_obj);
7278 Py_XDECREF(errorHandler);
7279 Py_XDECREF(exc);
7280 return ret;
7281}
7282
Victor Stinner3a50e702011-10-18 21:21:00 +02007283static PyObject *
7284decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007285 const char *s, Py_ssize_t size,
7286 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287{
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 PyObject *v = NULL;
7289 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 if (code_page < 0) {
7292 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7293 return NULL;
7294 }
7295
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007296 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007298
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 do
7300 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007302 if (size > INT_MAX) {
7303 chunk_size = INT_MAX;
7304 final = 0;
7305 done = 0;
7306 }
7307 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007309 {
7310 chunk_size = (int)size;
7311 final = (consumed == NULL);
7312 done = 1;
7313 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314
Victor Stinner76a31a62011-11-04 00:05:13 +01007315 if (chunk_size == 0 && done) {
7316 if (v != NULL)
7317 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007318 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 converted = decode_code_page_strict(code_page, &v,
7322 s, chunk_size);
7323 if (converted == -2)
7324 converted = decode_code_page_errors(code_page, &v,
7325 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007326 errors, final);
7327 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007328
7329 if (converted < 0) {
7330 Py_XDECREF(v);
7331 return NULL;
7332 }
7333
7334 if (consumed)
7335 *consumed += converted;
7336
7337 s += converted;
7338 size -= converted;
7339 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007340
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007341 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342}
7343
Alexander Belopolsky40018472011-02-26 01:02:56 +00007344PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007345PyUnicode_DecodeCodePageStateful(int code_page,
7346 const char *s,
7347 Py_ssize_t size,
7348 const char *errors,
7349 Py_ssize_t *consumed)
7350{
7351 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7352}
7353
7354PyObject *
7355PyUnicode_DecodeMBCSStateful(const char *s,
7356 Py_ssize_t size,
7357 const char *errors,
7358 Py_ssize_t *consumed)
7359{
7360 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7361}
7362
7363PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007364PyUnicode_DecodeMBCS(const char *s,
7365 Py_ssize_t size,
7366 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007367{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7369}
7370
Victor Stinner3a50e702011-10-18 21:21:00 +02007371static DWORD
7372encode_code_page_flags(UINT code_page, const char *errors)
7373{
7374 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007375 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 }
7377 else if (code_page == CP_UTF7) {
7378 /* CP_UTF7 only supports flags=0 */
7379 return 0;
7380 }
7381 else {
7382 if (errors != NULL && strcmp(errors, "replace") == 0)
7383 return 0;
7384 else
7385 return WC_NO_BEST_FIT_CHARS;
7386 }
7387}
7388
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007389/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 * Encode a Unicode string to a Windows code page into a byte string in strict
7391 * mode.
7392 *
7393 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007394 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007396static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007397encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007398 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007400{
Victor Stinner554f3f02010-06-16 23:33:54 +00007401 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 BOOL *pusedDefaultChar = &usedDefaultChar;
7403 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007404 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007405 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 const DWORD flags = encode_code_page_flags(code_page, NULL);
7407 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007408 /* Create a substring so that we can get the UTF-16 representation
7409 of just the slice under consideration. */
7410 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411
Martin v. Löwis3d325192011-11-04 18:23:06 +01007412 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007413
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007415 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007417 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007418
Victor Stinner2fc507f2011-11-04 20:06:39 +01007419 substring = PyUnicode_Substring(unicode, offset, offset+len);
7420 if (substring == NULL)
7421 return -1;
7422 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7423 if (p == NULL) {
7424 Py_DECREF(substring);
7425 return -1;
7426 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007427 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007428
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007429 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007431 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 NULL, 0,
7433 NULL, pusedDefaultChar);
7434 if (outsize <= 0)
7435 goto error;
7436 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 if (pusedDefaultChar && *pusedDefaultChar) {
7438 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007440 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007441
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007444 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007445 if (*outbytes == NULL) {
7446 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007448 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007450 }
7451 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 const Py_ssize_t n = PyBytes_Size(*outbytes);
7454 if (outsize > PY_SSIZE_T_MAX - n) {
7455 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7460 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464 }
7465
7466 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007468 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 out, outsize,
7470 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 if (outsize <= 0)
7473 goto error;
7474 if (pusedDefaultChar && *pusedDefaultChar)
7475 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007476 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7481 return -2;
7482 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007483 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007484}
7485
Victor Stinner3a50e702011-10-18 21:21:00 +02007486/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007487 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 * error handler.
7489 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007490 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 * -1 on other error.
7492 */
7493static int
7494encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007495 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007497{
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007499 Py_ssize_t pos = unicode_offset;
7500 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 /* Ideally, we should get reason from FormatMessage. This is the Windows
7502 2000 English version of the message. */
7503 const char *reason = "invalid character";
7504 /* 4=maximum length of a UTF-8 sequence */
7505 char buffer[4];
7506 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7507 Py_ssize_t outsize;
7508 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 PyObject *errorHandler = NULL;
7510 PyObject *exc = NULL;
7511 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007512 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 PyObject *rep;
7515 int ret = -1;
7516
7517 assert(insize > 0);
7518
7519 encoding = code_page_name(code_page, &encoding_obj);
7520 if (encoding == NULL)
7521 return -1;
7522
7523 if (errors == NULL || strcmp(errors, "strict") == 0) {
7524 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7525 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007526 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 if (exc != NULL) {
7528 PyCodec_StrictErrors(exc);
7529 Py_DECREF(exc);
7530 }
7531 Py_XDECREF(encoding_obj);
7532 return -1;
7533 }
7534
7535 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7536 pusedDefaultChar = &usedDefaultChar;
7537 else
7538 pusedDefaultChar = NULL;
7539
7540 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7541 PyErr_NoMemory();
7542 goto error;
7543 }
7544 outsize = insize * Py_ARRAY_LENGTH(buffer);
7545
7546 if (*outbytes == NULL) {
7547 /* Create string object */
7548 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7549 if (*outbytes == NULL)
7550 goto error;
7551 out = PyBytes_AS_STRING(*outbytes);
7552 }
7553 else {
7554 /* Extend string object */
7555 Py_ssize_t n = PyBytes_Size(*outbytes);
7556 if (n > PY_SSIZE_T_MAX - outsize) {
7557 PyErr_NoMemory();
7558 goto error;
7559 }
7560 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7561 goto error;
7562 out = PyBytes_AS_STRING(*outbytes) + n;
7563 }
7564
7565 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007566 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007568 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7569 wchar_t chars[2];
7570 int charsize;
7571 if (ch < 0x10000) {
7572 chars[0] = (wchar_t)ch;
7573 charsize = 1;
7574 }
7575 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007576 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7577 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007578 charsize = 2;
7579 }
7580
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007582 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 buffer, Py_ARRAY_LENGTH(buffer),
7584 NULL, pusedDefaultChar);
7585 if (outsize > 0) {
7586 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7587 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007588 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 memcpy(out, buffer, outsize);
7590 out += outsize;
7591 continue;
7592 }
7593 }
7594 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7595 PyErr_SetFromWindowsErr(0);
7596 goto error;
7597 }
7598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 rep = unicode_encode_call_errorhandler(
7600 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007601 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007602 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 if (rep == NULL)
7604 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007605 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007606
7607 if (PyBytes_Check(rep)) {
7608 outsize = PyBytes_GET_SIZE(rep);
7609 if (outsize != 1) {
7610 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7611 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7612 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7613 Py_DECREF(rep);
7614 goto error;
7615 }
7616 out = PyBytes_AS_STRING(*outbytes) + offset;
7617 }
7618 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7619 out += outsize;
7620 }
7621 else {
7622 Py_ssize_t i;
7623 enum PyUnicode_Kind kind;
7624 void *data;
7625
Benjamin Petersonbac79492012-01-14 13:34:47 -05007626 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 Py_DECREF(rep);
7628 goto error;
7629 }
7630
7631 outsize = PyUnicode_GET_LENGTH(rep);
7632 if (outsize != 1) {
7633 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7634 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7635 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7636 Py_DECREF(rep);
7637 goto error;
7638 }
7639 out = PyBytes_AS_STRING(*outbytes) + offset;
7640 }
7641 kind = PyUnicode_KIND(rep);
7642 data = PyUnicode_DATA(rep);
7643 for (i=0; i < outsize; i++) {
7644 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7645 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007646 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007647 encoding, unicode,
7648 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 "unable to encode error handler result to ASCII");
7650 Py_DECREF(rep);
7651 goto error;
7652 }
7653 *out = (unsigned char)ch;
7654 out++;
7655 }
7656 }
7657 Py_DECREF(rep);
7658 }
7659 /* write a NUL byte */
7660 *out = 0;
7661 outsize = out - PyBytes_AS_STRING(*outbytes);
7662 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7663 if (_PyBytes_Resize(outbytes, outsize) < 0)
7664 goto error;
7665 ret = 0;
7666
7667error:
7668 Py_XDECREF(encoding_obj);
7669 Py_XDECREF(errorHandler);
7670 Py_XDECREF(exc);
7671 return ret;
7672}
7673
Victor Stinner3a50e702011-10-18 21:21:00 +02007674static PyObject *
7675encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007676 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007677 const char *errors)
7678{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007679 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007681 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007682 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007683
Victor Stinner29dacf22015-01-26 16:41:32 +01007684 if (!PyUnicode_Check(unicode)) {
7685 PyErr_BadArgument();
7686 return NULL;
7687 }
7688
Benjamin Petersonbac79492012-01-14 13:34:47 -05007689 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007690 return NULL;
7691 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007692
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 if (code_page < 0) {
7694 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7695 return NULL;
7696 }
7697
Martin v. Löwis3d325192011-11-04 18:23:06 +01007698 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007699 return PyBytes_FromStringAndSize(NULL, 0);
7700
Victor Stinner7581cef2011-11-03 22:32:33 +01007701 offset = 0;
7702 do
7703 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007704#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007705 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007706 chunks. */
7707 if (len > INT_MAX/2) {
7708 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 done = 0;
7710 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007711 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007712#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007713 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007714 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007715 done = 1;
7716 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007717
Victor Stinner76a31a62011-11-04 00:05:13 +01007718 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007719 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007720 errors);
7721 if (ret == -2)
7722 ret = encode_code_page_errors(code_page, &outbytes,
7723 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007724 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007725 if (ret < 0) {
7726 Py_XDECREF(outbytes);
7727 return NULL;
7728 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729
Victor Stinner7581cef2011-11-03 22:32:33 +01007730 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007733
Victor Stinner3a50e702011-10-18 21:21:00 +02007734 return outbytes;
7735}
7736
7737PyObject *
7738PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7739 Py_ssize_t size,
7740 const char *errors)
7741{
Victor Stinner7581cef2011-11-03 22:32:33 +01007742 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007743 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007744 if (unicode == NULL)
7745 return NULL;
7746 res = encode_code_page(CP_ACP, unicode, errors);
7747 Py_DECREF(unicode);
7748 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007749}
7750
7751PyObject *
7752PyUnicode_EncodeCodePage(int code_page,
7753 PyObject *unicode,
7754 const char *errors)
7755{
Victor Stinner7581cef2011-11-03 22:32:33 +01007756 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007757}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007758
Alexander Belopolsky40018472011-02-26 01:02:56 +00007759PyObject *
7760PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007761{
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007763}
7764
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007765#undef NEED_RETRY
7766
Steve Dowercc16be82016-09-08 10:35:16 -07007767#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007768
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769/* --- Character Mapping Codec -------------------------------------------- */
7770
Victor Stinnerfb161b12013-04-18 01:44:27 +02007771static int
7772charmap_decode_string(const char *s,
7773 Py_ssize_t size,
7774 PyObject *mapping,
7775 const char *errors,
7776 _PyUnicodeWriter *writer)
7777{
7778 const char *starts = s;
7779 const char *e;
7780 Py_ssize_t startinpos, endinpos;
7781 PyObject *errorHandler = NULL, *exc = NULL;
7782 Py_ssize_t maplen;
7783 enum PyUnicode_Kind mapkind;
7784 void *mapdata;
7785 Py_UCS4 x;
7786 unsigned char ch;
7787
7788 if (PyUnicode_READY(mapping) == -1)
7789 return -1;
7790
7791 maplen = PyUnicode_GET_LENGTH(mapping);
7792 mapdata = PyUnicode_DATA(mapping);
7793 mapkind = PyUnicode_KIND(mapping);
7794
7795 e = s + size;
7796
7797 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7798 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7799 * is disabled in encoding aliases, latin1 is preferred because
7800 * its implementation is faster. */
7801 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7802 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7803 Py_UCS4 maxchar = writer->maxchar;
7804
7805 assert (writer->kind == PyUnicode_1BYTE_KIND);
7806 while (s < e) {
7807 ch = *s;
7808 x = mapdata_ucs1[ch];
7809 if (x > maxchar) {
7810 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7811 goto onError;
7812 maxchar = writer->maxchar;
7813 outdata = (Py_UCS1 *)writer->data;
7814 }
7815 outdata[writer->pos] = x;
7816 writer->pos++;
7817 ++s;
7818 }
7819 return 0;
7820 }
7821
7822 while (s < e) {
7823 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7824 enum PyUnicode_Kind outkind = writer->kind;
7825 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7826 if (outkind == PyUnicode_1BYTE_KIND) {
7827 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7828 Py_UCS4 maxchar = writer->maxchar;
7829 while (s < e) {
7830 ch = *s;
7831 x = mapdata_ucs2[ch];
7832 if (x > maxchar)
7833 goto Error;
7834 outdata[writer->pos] = x;
7835 writer->pos++;
7836 ++s;
7837 }
7838 break;
7839 }
7840 else if (outkind == PyUnicode_2BYTE_KIND) {
7841 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7842 while (s < e) {
7843 ch = *s;
7844 x = mapdata_ucs2[ch];
7845 if (x == 0xFFFE)
7846 goto Error;
7847 outdata[writer->pos] = x;
7848 writer->pos++;
7849 ++s;
7850 }
7851 break;
7852 }
7853 }
7854 ch = *s;
7855
7856 if (ch < maplen)
7857 x = PyUnicode_READ(mapkind, mapdata, ch);
7858 else
7859 x = 0xfffe; /* invalid value */
7860Error:
7861 if (x == 0xfffe)
7862 {
7863 /* undefined mapping */
7864 startinpos = s-starts;
7865 endinpos = startinpos+1;
7866 if (unicode_decode_call_errorhandler_writer(
7867 errors, &errorHandler,
7868 "charmap", "character maps to <undefined>",
7869 &starts, &e, &startinpos, &endinpos, &exc, &s,
7870 writer)) {
7871 goto onError;
7872 }
7873 continue;
7874 }
7875
7876 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7877 goto onError;
7878 ++s;
7879 }
7880 Py_XDECREF(errorHandler);
7881 Py_XDECREF(exc);
7882 return 0;
7883
7884onError:
7885 Py_XDECREF(errorHandler);
7886 Py_XDECREF(exc);
7887 return -1;
7888}
7889
7890static int
7891charmap_decode_mapping(const char *s,
7892 Py_ssize_t size,
7893 PyObject *mapping,
7894 const char *errors,
7895 _PyUnicodeWriter *writer)
7896{
7897 const char *starts = s;
7898 const char *e;
7899 Py_ssize_t startinpos, endinpos;
7900 PyObject *errorHandler = NULL, *exc = NULL;
7901 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007902 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007903
7904 e = s + size;
7905
7906 while (s < e) {
7907 ch = *s;
7908
7909 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7910 key = PyLong_FromLong((long)ch);
7911 if (key == NULL)
7912 goto onError;
7913
7914 item = PyObject_GetItem(mapping, key);
7915 Py_DECREF(key);
7916 if (item == NULL) {
7917 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7918 /* No mapping found means: mapping is undefined. */
7919 PyErr_Clear();
7920 goto Undefined;
7921 } else
7922 goto onError;
7923 }
7924
7925 /* Apply mapping */
7926 if (item == Py_None)
7927 goto Undefined;
7928 if (PyLong_Check(item)) {
7929 long value = PyLong_AS_LONG(item);
7930 if (value == 0xFFFE)
7931 goto Undefined;
7932 if (value < 0 || value > MAX_UNICODE) {
7933 PyErr_Format(PyExc_TypeError,
7934 "character mapping must be in range(0x%lx)",
7935 (unsigned long)MAX_UNICODE + 1);
7936 goto onError;
7937 }
7938
7939 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7940 goto onError;
7941 }
7942 else if (PyUnicode_Check(item)) {
7943 if (PyUnicode_READY(item) == -1)
7944 goto onError;
7945 if (PyUnicode_GET_LENGTH(item) == 1) {
7946 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7947 if (value == 0xFFFE)
7948 goto Undefined;
7949 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7950 goto onError;
7951 }
7952 else {
7953 writer->overallocate = 1;
7954 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7955 goto onError;
7956 }
7957 }
7958 else {
7959 /* wrong return value */
7960 PyErr_SetString(PyExc_TypeError,
7961 "character mapping must return integer, None or str");
7962 goto onError;
7963 }
7964 Py_CLEAR(item);
7965 ++s;
7966 continue;
7967
7968Undefined:
7969 /* undefined mapping */
7970 Py_CLEAR(item);
7971 startinpos = s-starts;
7972 endinpos = startinpos+1;
7973 if (unicode_decode_call_errorhandler_writer(
7974 errors, &errorHandler,
7975 "charmap", "character maps to <undefined>",
7976 &starts, &e, &startinpos, &endinpos, &exc, &s,
7977 writer)) {
7978 goto onError;
7979 }
7980 }
7981 Py_XDECREF(errorHandler);
7982 Py_XDECREF(exc);
7983 return 0;
7984
7985onError:
7986 Py_XDECREF(item);
7987 Py_XDECREF(errorHandler);
7988 Py_XDECREF(exc);
7989 return -1;
7990}
7991
Alexander Belopolsky40018472011-02-26 01:02:56 +00007992PyObject *
7993PyUnicode_DecodeCharmap(const char *s,
7994 Py_ssize_t size,
7995 PyObject *mapping,
7996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007998 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007999
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 /* Default to Latin-1 */
8001 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008005 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008006 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008007 writer.min_length = size;
8008 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008010
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008011 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008012 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8013 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008014 }
8015 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008016 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008019 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008020
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008022 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 return NULL;
8024}
8025
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026/* Charmap encoding: the lookup table */
8027
Alexander Belopolsky40018472011-02-26 01:02:56 +00008028struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 PyObject_HEAD
8030 unsigned char level1[32];
8031 int count2, count3;
8032 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033};
8034
8035static PyObject*
8036encoding_map_size(PyObject *obj, PyObject* args)
8037{
8038 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008039 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041}
8042
8043static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 PyDoc_STR("Return the size (in bytes) of this object") },
8046 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047};
8048
8049static void
8050encoding_map_dealloc(PyObject* o)
8051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008052 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053}
8054
8055static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 "EncodingMap", /*tp_name*/
8058 sizeof(struct encoding_map), /*tp_basicsize*/
8059 0, /*tp_itemsize*/
8060 /* methods */
8061 encoding_map_dealloc, /*tp_dealloc*/
8062 0, /*tp_print*/
8063 0, /*tp_getattr*/
8064 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008065 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 0, /*tp_repr*/
8067 0, /*tp_as_number*/
8068 0, /*tp_as_sequence*/
8069 0, /*tp_as_mapping*/
8070 0, /*tp_hash*/
8071 0, /*tp_call*/
8072 0, /*tp_str*/
8073 0, /*tp_getattro*/
8074 0, /*tp_setattro*/
8075 0, /*tp_as_buffer*/
8076 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8077 0, /*tp_doc*/
8078 0, /*tp_traverse*/
8079 0, /*tp_clear*/
8080 0, /*tp_richcompare*/
8081 0, /*tp_weaklistoffset*/
8082 0, /*tp_iter*/
8083 0, /*tp_iternext*/
8084 encoding_map_methods, /*tp_methods*/
8085 0, /*tp_members*/
8086 0, /*tp_getset*/
8087 0, /*tp_base*/
8088 0, /*tp_dict*/
8089 0, /*tp_descr_get*/
8090 0, /*tp_descr_set*/
8091 0, /*tp_dictoffset*/
8092 0, /*tp_init*/
8093 0, /*tp_alloc*/
8094 0, /*tp_new*/
8095 0, /*tp_free*/
8096 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097};
8098
8099PyObject*
8100PyUnicode_BuildEncodingMap(PyObject* string)
8101{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102 PyObject *result;
8103 struct encoding_map *mresult;
8104 int i;
8105 int need_dict = 0;
8106 unsigned char level1[32];
8107 unsigned char level2[512];
8108 unsigned char *mlevel1, *mlevel2, *mlevel3;
8109 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 int kind;
8111 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008112 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008115 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 PyErr_BadArgument();
8117 return NULL;
8118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 kind = PyUnicode_KIND(string);
8120 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008121 length = PyUnicode_GET_LENGTH(string);
8122 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008123 memset(level1, 0xFF, sizeof level1);
8124 memset(level2, 0xFF, sizeof level2);
8125
8126 /* If there isn't a one-to-one mapping of NULL to \0,
8127 or if there are non-BMP characters, we need to use
8128 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008129 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008130 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008131 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008133 ch = PyUnicode_READ(kind, data, i);
8134 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 need_dict = 1;
8136 break;
8137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139 /* unmapped character */
8140 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 l1 = ch >> 11;
8142 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 if (level1[l1] == 0xFF)
8144 level1[l1] = count2++;
8145 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008146 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 }
8148
8149 if (count2 >= 0xFF || count3 >= 0xFF)
8150 need_dict = 1;
8151
8152 if (need_dict) {
8153 PyObject *result = PyDict_New();
8154 PyObject *key, *value;
8155 if (!result)
8156 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008157 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008159 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 if (!key || !value)
8161 goto failed1;
8162 if (PyDict_SetItem(result, key, value) == -1)
8163 goto failed1;
8164 Py_DECREF(key);
8165 Py_DECREF(value);
8166 }
8167 return result;
8168 failed1:
8169 Py_XDECREF(key);
8170 Py_XDECREF(value);
8171 Py_DECREF(result);
8172 return NULL;
8173 }
8174
8175 /* Create a three-level trie */
8176 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8177 16*count2 + 128*count3 - 1);
8178 if (!result)
8179 return PyErr_NoMemory();
8180 PyObject_Init(result, &EncodingMapType);
8181 mresult = (struct encoding_map*)result;
8182 mresult->count2 = count2;
8183 mresult->count3 = count3;
8184 mlevel1 = mresult->level1;
8185 mlevel2 = mresult->level23;
8186 mlevel3 = mresult->level23 + 16*count2;
8187 memcpy(mlevel1, level1, 32);
8188 memset(mlevel2, 0xFF, 16*count2);
8189 memset(mlevel3, 0, 128*count3);
8190 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008191 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008192 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008193 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8194 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008195 /* unmapped character */
8196 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008197 o1 = ch>>11;
8198 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008199 i2 = 16*mlevel1[o1] + o2;
8200 if (mlevel2[i2] == 0xFF)
8201 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008202 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008203 i3 = 128*mlevel2[i2] + o3;
8204 mlevel3[i3] = i;
8205 }
8206 return result;
8207}
8208
8209static int
Victor Stinner22168992011-11-20 17:09:18 +01008210encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211{
8212 struct encoding_map *map = (struct encoding_map*)mapping;
8213 int l1 = c>>11;
8214 int l2 = (c>>7) & 0xF;
8215 int l3 = c & 0x7F;
8216 int i;
8217
Victor Stinner22168992011-11-20 17:09:18 +01008218 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220 if (c == 0)
8221 return 0;
8222 /* level 1*/
8223 i = map->level1[l1];
8224 if (i == 0xFF) {
8225 return -1;
8226 }
8227 /* level 2*/
8228 i = map->level23[16*i+l2];
8229 if (i == 0xFF) {
8230 return -1;
8231 }
8232 /* level 3 */
8233 i = map->level23[16*map->count2 + 128*i + l3];
8234 if (i == 0) {
8235 return -1;
8236 }
8237 return i;
8238}
8239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240/* Lookup the character ch in the mapping. If the character
8241 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008242 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008243static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008244charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245{
Christian Heimes217cfd12007-12-02 14:31:20 +00008246 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 PyObject *x;
8248
8249 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 x = PyObject_GetItem(mapping, w);
8252 Py_DECREF(w);
8253 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8255 /* No mapping found means: mapping is undefined. */
8256 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008257 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 } else
8259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008261 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008263 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 long value = PyLong_AS_LONG(x);
8265 if (value < 0 || value > 255) {
8266 PyErr_SetString(PyExc_TypeError,
8267 "character mapping must be in range(256)");
8268 Py_DECREF(x);
8269 return NULL;
8270 }
8271 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008273 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 /* wrong return value */
8277 PyErr_Format(PyExc_TypeError,
8278 "character mapping must return integer, bytes or None, not %.400s",
8279 x->ob_type->tp_name);
8280 Py_DECREF(x);
8281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 }
8283}
8284
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008285static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008286charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008288 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8289 /* exponentially overallocate to minimize reallocations */
8290 if (requiredsize < 2*outsize)
8291 requiredsize = 2*outsize;
8292 if (_PyBytes_Resize(outobj, requiredsize))
8293 return -1;
8294 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008295}
8296
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
/* Look up the character, put the result in the output bytes object and
   advance *outpos. Resize the output bytes object if not enough space is
   available. Return enc_SUCCESS if the character was written, enc_FAILED if
   the mapping is undefined (in which case nothing was written), or
   enc_EXCEPTION if the lookup or the reallocation failed. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008306static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008307charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008308 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008310 PyObject *rep;
8311 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008312 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313
Christian Heimes90aa7642007-12-19 02:45:37 +00008314 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 if (res == -1)
8318 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 if (outsize<requiredsize)
8320 if (charmapencode_resize(outobj, outpos, requiredsize))
8321 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008322 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 outstart[(*outpos)++] = (char)res;
8324 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008325 }
8326
8327 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 Py_DECREF(rep);
8332 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 if (PyLong_Check(rep)) {
8335 Py_ssize_t requiredsize = *outpos+1;
8336 if (outsize<requiredsize)
8337 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8338 Py_DECREF(rep);
8339 return enc_EXCEPTION;
8340 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008341 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 else {
8345 const char *repchars = PyBytes_AS_STRING(rep);
8346 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8347 Py_ssize_t requiredsize = *outpos+repsize;
8348 if (outsize<requiredsize)
8349 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8350 Py_DECREF(rep);
8351 return enc_EXCEPTION;
8352 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008353 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 memcpy(outstart + *outpos, repchars, repsize);
8355 *outpos += repsize;
8356 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 Py_DECREF(rep);
8359 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360}
8361
8362/* handle an error in PyUnicode_EncodeCharmap
8363 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008364static int
8365charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008366 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008368 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008369 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370{
8371 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008372 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008374 enum PyUnicode_Kind kind;
8375 void *data;
8376 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008378 Py_ssize_t collstartpos = *inpos;
8379 Py_ssize_t collendpos = *inpos+1;
8380 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 char *encoding = "charmap";
8382 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008384 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008385 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386
Benjamin Petersonbac79492012-01-14 13:34:47 -05008387 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008388 return -1;
8389 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 /* find all unencodable characters */
8391 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008392 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008393 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008394 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008395 val = encoding_map_lookup(ch, mapping);
8396 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 break;
8398 ++collendpos;
8399 continue;
8400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8403 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 if (rep==NULL)
8405 return -1;
8406 else if (rep!=Py_None) {
8407 Py_DECREF(rep);
8408 break;
8409 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 }
8413 /* cache callback name lookup
8414 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008415 if (*error_handler == _Py_ERROR_UNKNOWN)
8416 *error_handler = get_error_handler(errors);
8417
8418 switch (*error_handler) {
8419 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008420 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008422
8423 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 x = charmapencode_output('?', mapping, res, respos);
8426 if (x==enc_EXCEPTION) {
8427 return -1;
8428 }
8429 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008430 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 return -1;
8432 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 }
8434 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008435 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008436 *inpos = collendpos;
8437 break;
Victor Stinner50149202015-09-22 00:26:54 +02008438
8439 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008440 /* generate replacement (temporarily (mis)uses p) */
8441 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 char buffer[2+29+1+1];
8443 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008444 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 for (cp = buffer; *cp; ++cp) {
8446 x = charmapencode_output(*cp, mapping, res, respos);
8447 if (x==enc_EXCEPTION)
8448 return -1;
8449 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008450 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return -1;
8452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 }
8454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 *inpos = collendpos;
8456 break;
Victor Stinner50149202015-09-22 00:26:54 +02008457
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 default:
Victor Stinner50149202015-09-22 00:26:54 +02008459 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008460 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008464 if (PyBytes_Check(repunicode)) {
8465 /* Directly copy bytes result to output. */
8466 Py_ssize_t outsize = PyBytes_Size(*res);
8467 Py_ssize_t requiredsize;
8468 repsize = PyBytes_Size(repunicode);
8469 requiredsize = *respos + repsize;
8470 if (requiredsize > outsize)
8471 /* Make room for all additional bytes. */
8472 if (charmapencode_resize(res, respos, requiredsize)) {
8473 Py_DECREF(repunicode);
8474 return -1;
8475 }
8476 memcpy(PyBytes_AsString(*res) + *respos,
8477 PyBytes_AsString(repunicode), repsize);
8478 *respos += repsize;
8479 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008480 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008481 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008483 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008484 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008485 Py_DECREF(repunicode);
8486 return -1;
8487 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008488 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008489 data = PyUnicode_DATA(repunicode);
8490 kind = PyUnicode_KIND(repunicode);
8491 for (index = 0; index < repsize; index++) {
8492 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8493 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008495 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 return -1;
8497 }
8498 else if (x==enc_FAILED) {
8499 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008500 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 return -1;
8502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008503 }
8504 *inpos = newpos;
8505 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 }
8507 return 0;
8508}
8509
Alexander Belopolsky40018472011-02-26 01:02:56 +00008510PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008511_PyUnicode_EncodeCharmap(PyObject *unicode,
8512 PyObject *mapping,
8513 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 /* output object */
8516 PyObject *res = NULL;
8517 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008518 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008519 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008521 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008522 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008524 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008525 void *data;
8526 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527
Benjamin Petersonbac79492012-01-14 13:34:47 -05008528 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008529 return NULL;
8530 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008531 data = PyUnicode_DATA(unicode);
8532 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008533
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 /* Default to Latin-1 */
8535 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 /* allocate enough for a simple encoding without
8539 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008540 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 if (res == NULL)
8542 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008543 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008547 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008549 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 if (x==enc_EXCEPTION) /* error */
8551 goto onError;
8552 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008553 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008555 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 &res, &respos)) {
8557 goto onError;
8558 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008559 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 else
8561 /* done with this character => adjust input position */
8562 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008566 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008567 if (_PyBytes_Resize(&res, respos) < 0)
8568 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008571 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 return res;
8573
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 Py_XDECREF(res);
8576 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008577 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 return NULL;
8579}
8580
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008581/* Deprecated */
8582PyObject *
8583PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8584 Py_ssize_t size,
8585 PyObject *mapping,
8586 const char *errors)
8587{
8588 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008589 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008590 if (unicode == NULL)
8591 return NULL;
8592 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8593 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008594 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008595}
8596
Alexander Belopolsky40018472011-02-26 01:02:56 +00008597PyObject *
8598PyUnicode_AsCharmapString(PyObject *unicode,
8599 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600{
8601 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 PyErr_BadArgument();
8603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008605 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606}
8607
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008609static void
8610make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008612 Py_ssize_t startpos, Py_ssize_t endpos,
8613 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008615 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 *exceptionObject = _PyUnicodeTranslateError_Create(
8617 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 }
8619 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8621 goto onError;
8622 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8623 goto onError;
8624 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8625 goto onError;
8626 return;
8627 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008628 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 }
8630}
8631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632/* error handling callback helper:
8633 build arguments, call the callback and check the arguments,
8634 put the result into newpos and return the replacement string, which
8635 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008636static PyObject *
8637unicode_translate_call_errorhandler(const char *errors,
8638 PyObject **errorHandler,
8639 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008641 Py_ssize_t startpos, Py_ssize_t endpos,
8642 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008644 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008646 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 PyObject *restuple;
8648 PyObject *resunicode;
8649
8650 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654 }
8655
8656 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008661 restuple = PyObject_CallFunctionObjArgs(
8662 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008666 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 Py_DECREF(restuple);
8668 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008670 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 &resunicode, &i_newpos)) {
8672 Py_DECREF(restuple);
8673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008675 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008677 else
8678 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008680 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 Py_DECREF(restuple);
8682 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008683 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 Py_INCREF(resunicode);
8685 Py_DECREF(restuple);
8686 return resunicode;
8687}
8688
8689/* Lookup the character ch in the mapping and put the result in result,
8690 which must be decrefed by the caller.
8691 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008692static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694{
Christian Heimes217cfd12007-12-02 14:31:20 +00008695 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 PyObject *x;
8697
8698 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 x = PyObject_GetItem(mapping, w);
8701 Py_DECREF(w);
8702 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8704 /* No mapping found means: use 1:1 mapping. */
8705 PyErr_Clear();
8706 *result = NULL;
8707 return 0;
8708 } else
8709 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 }
8711 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 *result = x;
8713 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008715 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008717 if (value < 0 || value > MAX_UNICODE) {
8718 PyErr_Format(PyExc_ValueError,
8719 "character mapping must be in range(0x%x)",
8720 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 Py_DECREF(x);
8722 return -1;
8723 }
8724 *result = x;
8725 return 0;
8726 }
8727 else if (PyUnicode_Check(x)) {
8728 *result = x;
8729 return 0;
8730 }
8731 else {
8732 /* wrong return value */
8733 PyErr_SetString(PyExc_TypeError,
8734 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008735 Py_DECREF(x);
8736 return -1;
8737 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738}
Victor Stinner1194ea02014-04-04 19:37:40 +02008739
8740/* lookup the character, write the result into the writer.
8741 Return 1 if the result was written into the writer, return 0 if the mapping
8742 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008743static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008744charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8745 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746{
Victor Stinner1194ea02014-04-04 19:37:40 +02008747 PyObject *item;
8748
8749 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008751
8752 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008754 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008757 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008759
8760 if (item == Py_None) {
8761 Py_DECREF(item);
8762 return 0;
8763 }
8764
8765 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008766 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8767 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8768 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008769 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8770 Py_DECREF(item);
8771 return -1;
8772 }
8773 Py_DECREF(item);
8774 return 1;
8775 }
8776
8777 if (!PyUnicode_Check(item)) {
8778 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008780 }
8781
8782 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8783 Py_DECREF(item);
8784 return -1;
8785 }
8786
8787 Py_DECREF(item);
8788 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789}
8790
Victor Stinner89a76ab2014-04-05 11:44:04 +02008791static int
8792unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8793 Py_UCS1 *translate)
8794{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008795 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008796 int ret = 0;
8797
Victor Stinner89a76ab2014-04-05 11:44:04 +02008798 if (charmaptranslate_lookup(ch, mapping, &item)) {
8799 return -1;
8800 }
8801
8802 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008803 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008804 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008805 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008806 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008807 /* not found => default to 1:1 mapping */
8808 translate[ch] = ch;
8809 return 1;
8810 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008811 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008812 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008813 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8814 used it */
8815 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 /* invalid character or character outside ASCII:
8817 skip the fast translate */
8818 goto exit;
8819 }
8820 translate[ch] = (Py_UCS1)replace;
8821 }
8822 else if (PyUnicode_Check(item)) {
8823 Py_UCS4 replace;
8824
8825 if (PyUnicode_READY(item) == -1) {
8826 Py_DECREF(item);
8827 return -1;
8828 }
8829 if (PyUnicode_GET_LENGTH(item) != 1)
8830 goto exit;
8831
8832 replace = PyUnicode_READ_CHAR(item, 0);
8833 if (replace > 127)
8834 goto exit;
8835 translate[ch] = (Py_UCS1)replace;
8836 }
8837 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008838 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839 goto exit;
8840 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008841 ret = 1;
8842
Benjamin Peterson1365de72014-04-07 20:15:41 -04008843 exit:
8844 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008845 return ret;
8846}
8847
8848/* Fast path for ascii => ascii translation. Return 1 if the whole string
8849 was translated into writer, return 0 if the input string was partially
8850 translated into writer, raise an exception and return -1 on error. */
8851static int
8852unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008853 _PyUnicodeWriter *writer, int ignore,
8854 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008855{
Victor Stinner872b2912014-04-05 14:27:07 +02008856 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857 Py_ssize_t len;
8858 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008859 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861 len = PyUnicode_GET_LENGTH(input);
8862
Victor Stinner872b2912014-04-05 14:27:07 +02008863 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008864
8865 in = PyUnicode_1BYTE_DATA(input);
8866 end = in + len;
8867
8868 assert(PyUnicode_IS_ASCII(writer->buffer));
8869 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8870 out = PyUnicode_1BYTE_DATA(writer->buffer);
8871
Victor Stinner872b2912014-04-05 14:27:07 +02008872 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008873 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008874 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008876 int translate = unicode_fast_translate_lookup(mapping, ch,
8877 ascii_table);
8878 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008880 if (translate == 0)
8881 goto exit;
8882 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883 }
Victor Stinner872b2912014-04-05 14:27:07 +02008884 if (ch2 == 0xfe) {
8885 if (ignore)
8886 continue;
8887 goto exit;
8888 }
8889 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008890 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008891 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 }
Victor Stinner872b2912014-04-05 14:27:07 +02008893 res = 1;
8894
8895exit:
8896 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008897 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008898 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008899}
8900
Victor Stinner3222da22015-10-01 22:07:32 +02008901static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902_PyUnicode_TranslateCharmap(PyObject *input,
8903 PyObject *mapping,
8904 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008907 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 Py_ssize_t size, i;
8909 int kind;
8910 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008911 _PyUnicodeWriter writer;
8912 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008913 char *reason = "character maps to <undefined>";
8914 PyObject *errorHandler = NULL;
8915 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008916 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008918
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 PyErr_BadArgument();
8921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 if (PyUnicode_READY(input) == -1)
8925 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008926 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 kind = PyUnicode_KIND(input);
8928 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008930 if (size == 0)
8931 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008933 /* allocate enough for a simple 1:1 translation without
8934 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008935 _PyUnicodeWriter_Init(&writer);
8936 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938
Victor Stinner872b2912014-04-05 14:27:07 +02008939 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8940
Victor Stinner33798672016-03-01 21:59:58 +01008941 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008942 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008943 if (PyUnicode_IS_ASCII(input)) {
8944 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8945 if (res < 0) {
8946 _PyUnicodeWriter_Dealloc(&writer);
8947 return NULL;
8948 }
8949 if (res == 1)
8950 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008951 }
Victor Stinner33798672016-03-01 21:59:58 +01008952 else {
8953 i = 0;
8954 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 int translate;
8959 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8960 Py_ssize_t newpos;
8961 /* startpos for collecting untranslatable chars */
8962 Py_ssize_t collstart;
8963 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965
Victor Stinner1194ea02014-04-04 19:37:40 +02008966 ch = PyUnicode_READ(kind, data, i);
8967 translate = charmaptranslate_output(ch, mapping, &writer);
8968 if (translate < 0)
8969 goto onError;
8970
8971 if (translate != 0) {
8972 /* it worked => adjust input pointer */
8973 ++i;
8974 continue;
8975 }
8976
8977 /* untranslatable character */
8978 collstart = i;
8979 collend = i+1;
8980
8981 /* find all untranslatable characters */
8982 while (collend < size) {
8983 PyObject *x;
8984 ch = PyUnicode_READ(kind, data, collend);
8985 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008986 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008987 Py_XDECREF(x);
8988 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008990 ++collend;
8991 }
8992
8993 if (ignore) {
8994 i = collend;
8995 }
8996 else {
8997 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8998 reason, input, &exc,
8999 collstart, collend, &newpos);
9000 if (repunicode == NULL)
9001 goto onError;
9002 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009004 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009005 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009006 Py_DECREF(repunicode);
9007 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009008 }
9009 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009010 Py_XDECREF(exc);
9011 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009015 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009016 Py_XDECREF(exc);
9017 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 return NULL;
9019}
9020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021/* Deprecated. Use PyUnicode_Translate instead. */
9022PyObject *
9023PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9024 Py_ssize_t size,
9025 PyObject *mapping,
9026 const char *errors)
9027{
Christian Heimes5f520f42012-09-11 14:03:25 +02009028 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009029 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 if (!unicode)
9031 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009032 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9033 Py_DECREF(unicode);
9034 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035}
9036
Alexander Belopolsky40018472011-02-26 01:02:56 +00009037PyObject *
9038PyUnicode_Translate(PyObject *str,
9039 PyObject *mapping,
9040 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009042 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009043 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009044 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045}
Tim Petersced69f82003-09-16 20:30:58 +00009046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009048fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049{
9050 /* No need to call PyUnicode_READY(self) because this function is only
9051 called as a callback from fixup() which does it already. */
9052 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9053 const int kind = PyUnicode_KIND(self);
9054 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009055 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009056 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 Py_ssize_t i;
9058
9059 for (i = 0; i < len; ++i) {
9060 ch = PyUnicode_READ(kind, data, i);
9061 fixed = 0;
9062 if (ch > 127) {
9063 if (Py_UNICODE_ISSPACE(ch))
9064 fixed = ' ';
9065 else {
9066 const int decimal = Py_UNICODE_TODECIMAL(ch);
9067 if (decimal >= 0)
9068 fixed = '0' + decimal;
9069 }
9070 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009071 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009072 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 PyUnicode_WRITE(kind, data, i, fixed);
9074 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009075 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009076 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 }
9079
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009080 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081}
9082
9083PyObject *
9084_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9085{
9086 if (!PyUnicode_Check(unicode)) {
9087 PyErr_BadInternalCall();
9088 return NULL;
9089 }
9090 if (PyUnicode_READY(unicode) == -1)
9091 return NULL;
9092 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9093 /* If the string is already ASCII, just return the same string */
9094 Py_INCREF(unicode);
9095 return unicode;
9096 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009097 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098}
9099
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009100PyObject *
9101PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9102 Py_ssize_t length)
9103{
Victor Stinnerf0124502011-11-21 23:12:56 +01009104 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009105 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009106 Py_UCS4 maxchar;
9107 enum PyUnicode_Kind kind;
9108 void *data;
9109
Victor Stinner99d7ad02012-02-22 13:37:39 +01009110 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009111 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009112 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009113 if (ch > 127) {
9114 int decimal = Py_UNICODE_TODECIMAL(ch);
9115 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009116 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009117 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009118 }
9119 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009120
9121 /* Copy to a new string */
9122 decimal = PyUnicode_New(length, maxchar);
9123 if (decimal == NULL)
9124 return decimal;
9125 kind = PyUnicode_KIND(decimal);
9126 data = PyUnicode_DATA(decimal);
9127 /* Iterate over code points */
9128 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009129 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009130 if (ch > 127) {
9131 int decimal = Py_UNICODE_TODECIMAL(ch);
9132 if (decimal >= 0)
9133 ch = '0' + decimal;
9134 }
9135 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009137 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009138}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009139/* --- Decimal Encoder ---------------------------------------------------- */
9140
Alexander Belopolsky40018472011-02-26 01:02:56 +00009141int
9142PyUnicode_EncodeDecimal(Py_UNICODE *s,
9143 Py_ssize_t length,
9144 char *output,
9145 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009146{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009147 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009148 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009149 enum PyUnicode_Kind kind;
9150 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009151
9152 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 PyErr_BadArgument();
9154 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009155 }
9156
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009157 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009158 if (unicode == NULL)
9159 return -1;
9160
Victor Stinner42bf7752011-11-21 22:52:58 +01009161 kind = PyUnicode_KIND(unicode);
9162 data = PyUnicode_DATA(unicode);
9163
Victor Stinnerb84d7232011-11-22 01:50:07 +01009164 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009165 PyObject *exc;
9166 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009168 Py_ssize_t startpos;
9169
9170 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009171
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009173 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009174 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009176 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 decimal = Py_UNICODE_TODECIMAL(ch);
9178 if (decimal >= 0) {
9179 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009180 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 continue;
9182 }
9183 if (0 < ch && ch < 256) {
9184 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009185 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 continue;
9187 }
Victor Stinner6345be92011-11-25 20:09:01 +01009188
Victor Stinner42bf7752011-11-21 22:52:58 +01009189 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009190 exc = NULL;
9191 raise_encode_exception(&exc, "decimal", unicode,
9192 startpos, startpos+1,
9193 "invalid decimal Unicode string");
9194 Py_XDECREF(exc);
9195 Py_DECREF(unicode);
9196 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009197 }
9198 /* 0-terminate the output string */
9199 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009200 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009201 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009202}
9203
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204/* --- Helpers ------------------------------------------------------------ */
9205
/* Helper macro to fix up start/end slice values against a length.

   - end greater than len is clamped to len;
   - a negative end has len added once and is then clamped at 0;
   - a negative start has len added once and is then clamped at 0;
   - start is not clamped against len.

   start and end must be modifiable lvalues and are updated in place.  Every
   argument can be evaluated more than once, so only pass expressions
   without side effects.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement: it must be followed by a semicolon and is safe
   inside unbraced if/else bodies. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009222any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009224 Py_ssize_t end,
9225 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009227 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 void *buf1, *buf2;
9229 Py_ssize_t len1, len2, result;
9230
9231 kind1 = PyUnicode_KIND(s1);
9232 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009233 if (kind1 < kind2)
9234 return -1;
9235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 len1 = PyUnicode_GET_LENGTH(s1);
9237 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009238 ADJUST_INDICES(start, end, len1);
9239 if (end - start < len2)
9240 return -1;
9241
9242 buf1 = PyUnicode_DATA(s1);
9243 buf2 = PyUnicode_DATA(s2);
9244 if (len2 == 1) {
9245 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9246 result = findchar((const char *)buf1 + kind1*start,
9247 kind1, end - start, ch, direction);
9248 if (result == -1)
9249 return -1;
9250 else
9251 return start + result;
9252 }
9253
9254 if (kind2 != kind1) {
9255 buf2 = _PyUnicode_AsKind(s2, kind1);
9256 if (!buf2)
9257 return -2;
9258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259
Victor Stinner794d5672011-10-10 03:21:36 +02009260 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009261 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009262 case PyUnicode_1BYTE_KIND:
9263 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9264 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9265 else
9266 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9267 break;
9268 case PyUnicode_2BYTE_KIND:
9269 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9270 break;
9271 case PyUnicode_4BYTE_KIND:
9272 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9273 break;
9274 default:
9275 assert(0); result = -2;
9276 }
9277 }
9278 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
9293 assert(0); result = -2;
9294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 }
9296
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 PyMem_Free(buf2);
9299
9300 return result;
9301}
9302
/* Insert thousands separators into a run of digits by calling the stringlib
   *_InsertThousandsGrouping helper selected by the kind of unicode.

   unicode, index: destination string and the character offset inside it at
       which the output pointer handed to the helper is positioned.  When
       unicode is NULL the helper is called with a NULL buffer and the
       1-byte kind (presumably a "compute the size only" mode -- confirm
       against the stringlib helper).
   n_buffer, digits, n_digits, min_width, grouping: forwarded unchanged to
       the helper.
   thousands_sep: separator string; its data and length are forwarded.
   maxchar: only written when unicode is NULL.  It is set to 127, and raised
       to the maximum character value of thousands_sep when the helper's
       result differs from n_digits.

   Returns the helper's result, or -1 when a temporary buffer could not be
   created or the kind is not one of the three known kinds. */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* byte address of character number `index` (kind is used as the
           per-character byte width here) */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            /* separator is narrower: use a temporary copy of it in the
               kind of the destination (freed below) */
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): separator is wider than the destination.  data
               is replaced by a temporary copy of the whole of unicode, but
               the switch below still dispatches on the old `kind`, the copy
               does not appear to be offset by `index`, and it is freed
               below without being copied back.  It looks like output
               written in this case never reaches `unicode` -- confirm
               whether callers can reach this branch. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        /* NOTE(review): this path returns without freeing a temporary
           buffer created above; it is only reachable for an unknown kind. */
        assert(0);
        return -1;
    }
    /* release whichever temporary buffer was created before the switch */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        /* no destination string: report the maximum character instead */
        *maxchar = 127;
        if (len != n_digits) {
            *maxchar = Py_MAX(*maxchar,
                              PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}
9385
9386
Alexander Belopolsky40018472011-02-26 01:02:56 +00009387Py_ssize_t
9388PyUnicode_Count(PyObject *str,
9389 PyObject *substr,
9390 Py_ssize_t start,
9391 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009393 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009394 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 void *buf1 = NULL, *buf2 = NULL;
9396 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009397
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009398 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009400
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009401 kind1 = PyUnicode_KIND(str);
9402 kind2 = PyUnicode_KIND(substr);
9403 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009404 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009405
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009406 len1 = PyUnicode_GET_LENGTH(str);
9407 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009409 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009410 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009412 buf1 = PyUnicode_DATA(str);
9413 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009414 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009416 if (!buf2)
9417 goto onError;
9418 }
9419
9420 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009422 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009423 result = asciilib_count(
9424 ((Py_UCS1*)buf1) + start, end - start,
9425 buf2, len2, PY_SSIZE_T_MAX
9426 );
9427 else
9428 result = ucs1lib_count(
9429 ((Py_UCS1*)buf1) + start, end - start,
9430 buf2, len2, PY_SSIZE_T_MAX
9431 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 break;
9433 case PyUnicode_2BYTE_KIND:
9434 result = ucs2lib_count(
9435 ((Py_UCS2*)buf1) + start, end - start,
9436 buf2, len2, PY_SSIZE_T_MAX
9437 );
9438 break;
9439 case PyUnicode_4BYTE_KIND:
9440 result = ucs4lib_count(
9441 ((Py_UCS4*)buf1) + start, end - start,
9442 buf2, len2, PY_SSIZE_T_MAX
9443 );
9444 break;
9445 default:
9446 assert(0); result = 0;
9447 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009448
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009449 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 PyMem_Free(buf2);
9451
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009454 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 PyMem_Free(buf2);
9456 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457}
9458
Alexander Belopolsky40018472011-02-26 01:02:56 +00009459Py_ssize_t
9460PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009461 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009462 Py_ssize_t start,
9463 Py_ssize_t end,
9464 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009466 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009468
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009469 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470}
9471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472Py_ssize_t
9473PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9474 Py_ssize_t start, Py_ssize_t end,
9475 int direction)
9476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009478 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 if (PyUnicode_READY(str) == -1)
9480 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009481 len = PyUnicode_GET_LENGTH(str);
9482 ADJUST_INDICES(start, end, len);
9483 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009484 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009486 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9487 kind, end-start, ch, direction);
9488 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009490 else
9491 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492}
9493
Alexander Belopolsky40018472011-02-26 01:02:56 +00009494static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009495tailmatch(PyObject *self,
9496 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009497 Py_ssize_t start,
9498 Py_ssize_t end,
9499 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 int kind_self;
9502 int kind_sub;
9503 void *data_self;
9504 void *data_sub;
9505 Py_ssize_t offset;
9506 Py_ssize_t i;
9507 Py_ssize_t end_sub;
9508
9509 if (PyUnicode_READY(self) == -1 ||
9510 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009511 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9514 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009518 if (PyUnicode_GET_LENGTH(substring) == 0)
9519 return 1;
9520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 kind_self = PyUnicode_KIND(self);
9522 data_self = PyUnicode_DATA(self);
9523 kind_sub = PyUnicode_KIND(substring);
9524 data_sub = PyUnicode_DATA(substring);
9525 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9526
9527 if (direction > 0)
9528 offset = end;
9529 else
9530 offset = start;
9531
9532 if (PyUnicode_READ(kind_self, data_self, offset) ==
9533 PyUnicode_READ(kind_sub, data_sub, 0) &&
9534 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9535 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9536 /* If both are of the same kind, memcmp is sufficient */
9537 if (kind_self == kind_sub) {
9538 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009539 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 data_sub,
9541 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009542 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009544 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 else {
9546 /* We do not need to compare 0 and len(substring)-1 because
9547 the if statement above ensured already that they are equal
9548 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 for (i = 1; i < end_sub; ++i) {
9550 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9551 PyUnicode_READ(kind_sub, data_sub, i))
9552 return 0;
9553 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 }
9557
9558 return 0;
9559}
9560
Alexander Belopolsky40018472011-02-26 01:02:56 +00009561Py_ssize_t
9562PyUnicode_Tailmatch(PyObject *str,
9563 PyObject *substr,
9564 Py_ssize_t start,
9565 Py_ssize_t end,
9566 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009568 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009570
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009571 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572}
9573
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574/* Apply fixfct filter to the Unicode object self and return a
9575 reference to the modified object */
9576
Alexander Belopolsky40018472011-02-26 01:02:56 +00009577static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009578fixup(PyObject *self,
9579 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 PyObject *u;
9582 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009583 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009585 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009588 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 /* fix functions return the new maximum character in a string,
9591 if the kind of the resulting unicode object does not change,
9592 everything is fine. Otherwise we need to change the string kind
9593 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009594 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009595
9596 if (maxchar_new == 0) {
9597 /* no changes */;
9598 if (PyUnicode_CheckExact(self)) {
9599 Py_DECREF(u);
9600 Py_INCREF(self);
9601 return self;
9602 }
9603 else
9604 return u;
9605 }
9606
Victor Stinnere6abb482012-05-02 01:15:40 +02009607 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608
Victor Stinnereaab6042011-12-11 22:22:39 +01009609 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009611
9612 /* In case the maximum character changed, we need to
9613 convert the string to the new category. */
9614 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9615 if (v == NULL) {
9616 Py_DECREF(u);
9617 return NULL;
9618 }
9619 if (maxchar_new > maxchar_old) {
9620 /* If the maxchar increased so that the kind changed, not all
9621 characters are representable anymore and we need to fix the
9622 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009623 _PyUnicode_FastCopyCharacters(v, 0,
9624 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009625 maxchar_old = fixfct(v);
9626 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 }
9628 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009629 _PyUnicode_FastCopyCharacters(v, 0,
9630 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009632 Py_DECREF(u);
9633 assert(_PyUnicode_CheckConsistency(v, 1));
9634 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635}
9636
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637static PyObject *
9638ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009640 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9641 char *resdata, *data = PyUnicode_DATA(self);
9642 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009643
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009644 res = PyUnicode_New(len, 127);
9645 if (res == NULL)
9646 return NULL;
9647 resdata = PyUnicode_DATA(res);
9648 if (lower)
9649 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651 _Py_bytes_upper(resdata, data, len);
9652 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653}
9654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 Py_ssize_t j;
9659 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009660 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009662
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9664
9665 where ! is a negation and \p{xxx} is a character with property xxx.
9666 */
9667 for (j = i - 1; j >= 0; j--) {
9668 c = PyUnicode_READ(kind, data, j);
9669 if (!_PyUnicode_IsCaseIgnorable(c))
9670 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9673 if (final_sigma) {
9674 for (j = i + 1; j < length; j++) {
9675 c = PyUnicode_READ(kind, data, j);
9676 if (!_PyUnicode_IsCaseIgnorable(c))
9677 break;
9678 }
9679 final_sigma = j == length || !_PyUnicode_IsCased(c);
9680 }
9681 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682}
9683
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684static int
9685lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9686 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 /* Obscure special case. */
9689 if (c == 0x3A3) {
9690 mapped[0] = handle_capital_sigma(kind, data, length, i);
9691 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694}
9695
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696static Py_ssize_t
9697do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009699 Py_ssize_t i, k = 0;
9700 int n_res, j;
9701 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009702
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 c = PyUnicode_READ(kind, data, 0);
9704 n_res = _PyUnicode_ToUpperFull(c, mapped);
9705 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009706 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 for (i = 1; i < length; i++) {
9710 c = PyUnicode_READ(kind, data, i);
9711 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9712 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009713 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009715 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009716 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718}
9719
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720static Py_ssize_t
9721do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9722 Py_ssize_t i, k = 0;
9723
9724 for (i = 0; i < length; i++) {
9725 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9726 int n_res, j;
9727 if (Py_UNICODE_ISUPPER(c)) {
9728 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9729 }
9730 else if (Py_UNICODE_ISLOWER(c)) {
9731 n_res = _PyUnicode_ToUpperFull(c, mapped);
9732 }
9733 else {
9734 n_res = 1;
9735 mapped[0] = c;
9736 }
9737 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009738 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 res[k++] = mapped[j];
9740 }
9741 }
9742 return k;
9743}
9744
9745static Py_ssize_t
9746do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9747 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 Py_ssize_t i, k = 0;
9750
9751 for (i = 0; i < length; i++) {
9752 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9753 int n_res, j;
9754 if (lower)
9755 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9756 else
9757 n_res = _PyUnicode_ToUpperFull(c, mapped);
9758 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009759 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009760 res[k++] = mapped[j];
9761 }
9762 }
9763 return k;
9764}
9765
9766static Py_ssize_t
9767do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9768{
9769 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9770}
9771
9772static Py_ssize_t
9773do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9774{
9775 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9776}
9777
Benjamin Petersone51757f2012-01-12 21:10:29 -05009778static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009779do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9780{
9781 Py_ssize_t i, k = 0;
9782
9783 for (i = 0; i < length; i++) {
9784 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9785 Py_UCS4 mapped[3];
9786 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9787 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009788 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009789 res[k++] = mapped[j];
9790 }
9791 }
9792 return k;
9793}
9794
9795static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009796do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 Py_ssize_t i, k = 0;
9799 int previous_is_cased;
9800
9801 previous_is_cased = 0;
9802 for (i = 0; i < length; i++) {
9803 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9804 Py_UCS4 mapped[3];
9805 int n_res, j;
9806
9807 if (previous_is_cased)
9808 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9809 else
9810 n_res = _PyUnicode_ToTitleFull(c, mapped);
9811
9812 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009813 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009814 res[k++] = mapped[j];
9815 }
9816
9817 previous_is_cased = _PyUnicode_IsCased(c);
9818 }
9819 return k;
9820}
9821
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822static PyObject *
9823case_operation(PyObject *self,
9824 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9825{
9826 PyObject *res = NULL;
9827 Py_ssize_t length, newlength = 0;
9828 int kind, outkind;
9829 void *data, *outdata;
9830 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9831
Benjamin Petersoneea48462012-01-16 14:28:50 -05009832 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833
9834 kind = PyUnicode_KIND(self);
9835 data = PyUnicode_DATA(self);
9836 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009837 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009838 PyErr_SetString(PyExc_OverflowError, "string is too long");
9839 return NULL;
9840 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009841 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009842 if (tmp == NULL)
9843 return PyErr_NoMemory();
9844 newlength = perform(kind, data, length, tmp, &maxchar);
9845 res = PyUnicode_New(newlength, maxchar);
9846 if (res == NULL)
9847 goto leave;
9848 tmpend = tmp + newlength;
9849 outdata = PyUnicode_DATA(res);
9850 outkind = PyUnicode_KIND(res);
9851 switch (outkind) {
9852 case PyUnicode_1BYTE_KIND:
9853 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9854 break;
9855 case PyUnicode_2BYTE_KIND:
9856 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9857 break;
9858 case PyUnicode_4BYTE_KIND:
9859 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9860 break;
9861 default:
9862 assert(0);
9863 break;
9864 }
9865 leave:
9866 PyMem_FREE(tmp);
9867 return res;
9868}
9869
Tim Peters8ce9f162004-08-27 01:49:32 +00009870PyObject *
9871PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009873 PyObject *res;
9874 PyObject *fseq;
9875 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009876 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009878 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009879 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009880 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009881 }
9882
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009883 /* NOTE: the following code can't call back into Python code,
9884 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009885 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009886
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009887 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009888 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009889 res = _PyUnicode_JoinArray(separator, items, seqlen);
9890 Py_DECREF(fseq);
9891 return res;
9892}
9893
9894PyObject *
9895_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9896{
9897 PyObject *res = NULL; /* the result */
9898 PyObject *sep = NULL;
9899 Py_ssize_t seplen;
9900 PyObject *item;
9901 Py_ssize_t sz, i, res_offset;
9902 Py_UCS4 maxchar;
9903 Py_UCS4 item_maxchar;
9904 int use_memcpy;
9905 unsigned char *res_data = NULL, *sep_data = NULL;
9906 PyObject *last_obj;
9907 unsigned int kind = 0;
9908
Tim Peters05eba1f2004-08-27 21:32:02 +00009909 /* If empty sequence, return u"". */
9910 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009911 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009912 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009913
Tim Peters05eba1f2004-08-27 21:32:02 +00009914 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009915 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009916 if (seqlen == 1) {
9917 if (PyUnicode_CheckExact(items[0])) {
9918 res = items[0];
9919 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009920 return res;
9921 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009922 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009923 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009924 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009925 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009926 /* Set up sep and seplen */
9927 if (separator == NULL) {
9928 /* fall back to a blank space separator */
9929 sep = PyUnicode_FromOrdinal(' ');
9930 if (!sep)
9931 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009932 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009933 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009934 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009935 else {
9936 if (!PyUnicode_Check(separator)) {
9937 PyErr_Format(PyExc_TypeError,
9938 "separator: expected str instance,"
9939 " %.80s found",
9940 Py_TYPE(separator)->tp_name);
9941 goto onError;
9942 }
9943 if (PyUnicode_READY(separator))
9944 goto onError;
9945 sep = separator;
9946 seplen = PyUnicode_GET_LENGTH(separator);
9947 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9948 /* inc refcount to keep this code path symmetric with the
9949 above case of a blank separator */
9950 Py_INCREF(sep);
9951 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009952 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009953 }
9954
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009955 /* There are at least two things to join, or else we have a subclass
9956 * of str in the sequence.
9957 * Do a pre-pass to figure out the total amount of space we'll
9958 * need (sz), and see whether all argument are strings.
9959 */
9960 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009961#ifdef Py_DEBUG
9962 use_memcpy = 0;
9963#else
9964 use_memcpy = 1;
9965#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009967 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009969 if (!PyUnicode_Check(item)) {
9970 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009971 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 " %.80s found",
9973 i, Py_TYPE(item)->tp_name);
9974 goto onError;
9975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 if (PyUnicode_READY(item) == -1)
9977 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009978 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009980 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009981 if (i != 0) {
9982 add_sz += seplen;
9983 }
9984 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009985 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009987 goto onError;
9988 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009989 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009990 if (use_memcpy && last_obj != NULL) {
9991 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9992 use_memcpy = 0;
9993 }
9994 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009995 }
Tim Petersced69f82003-09-16 20:30:58 +00009996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009998 if (res == NULL)
9999 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010000
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010001 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010002#ifdef Py_DEBUG
10003 use_memcpy = 0;
10004#else
10005 if (use_memcpy) {
10006 res_data = PyUnicode_1BYTE_DATA(res);
10007 kind = PyUnicode_KIND(res);
10008 if (seplen != 0)
10009 sep_data = PyUnicode_1BYTE_DATA(sep);
10010 }
10011#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010012 if (use_memcpy) {
10013 for (i = 0; i < seqlen; ++i) {
10014 Py_ssize_t itemlen;
10015 item = items[i];
10016
10017 /* Copy item, and maybe the separator. */
10018 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010019 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010020 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010021 kind * seplen);
10022 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010023 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010024
10025 itemlen = PyUnicode_GET_LENGTH(item);
10026 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010027 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010028 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010029 kind * itemlen);
10030 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010032 }
10033 assert(res_data == PyUnicode_1BYTE_DATA(res)
10034 + kind * PyUnicode_GET_LENGTH(res));
10035 }
10036 else {
10037 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10038 Py_ssize_t itemlen;
10039 item = items[i];
10040
10041 /* Copy item, and maybe the separator. */
10042 if (i && seplen != 0) {
10043 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10044 res_offset += seplen;
10045 }
10046
10047 itemlen = PyUnicode_GET_LENGTH(item);
10048 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010049 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010050 res_offset += itemlen;
10051 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010052 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010053 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010054 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010057 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059
Benjamin Peterson29060642009-01-31 22:14:21 +000010060 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010062 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063 return NULL;
10064}
10065
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the element width
   (1, 2 or 4 bytes); PyUnicode_WCHAR_KIND is not supported.

   Every macro argument is parenthesized at its point of use so that an
   expression argument (e.g. `a ? b : c`) is cast or compared as a whole.
   Note that `value` and `length` are evaluated more than once in the
   2-byte and 4-byte branches, so arguments must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10089
Victor Stinnerd3f08822012-05-29 12:57:52 +020010090void
10091_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10092 Py_UCS4 fill_char)
10093{
10094 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10095 const void *data = PyUnicode_DATA(unicode);
10096 assert(PyUnicode_IS_READY(unicode));
10097 assert(unicode_modifiable(unicode));
10098 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10099 assert(start >= 0);
10100 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10101 FILL(kind, data, fill_char, start, length);
10102}
10103
Victor Stinner3fe55312012-01-04 00:33:50 +010010104Py_ssize_t
10105PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10106 Py_UCS4 fill_char)
10107{
10108 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010109
10110 if (!PyUnicode_Check(unicode)) {
10111 PyErr_BadInternalCall();
10112 return -1;
10113 }
10114 if (PyUnicode_READY(unicode) == -1)
10115 return -1;
10116 if (unicode_check_modifiable(unicode))
10117 return -1;
10118
Victor Stinnerd3f08822012-05-29 12:57:52 +020010119 if (start < 0) {
10120 PyErr_SetString(PyExc_IndexError, "string index out of range");
10121 return -1;
10122 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010123 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10124 PyErr_SetString(PyExc_ValueError,
10125 "fill character is bigger than "
10126 "the string maximum character");
10127 return -1;
10128 }
10129
10130 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10131 length = Py_MIN(maxlen, length);
10132 if (length <= 0)
10133 return 0;
10134
Victor Stinnerd3f08822012-05-29 12:57:52 +020010135 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010136 return length;
10137}
10138
Victor Stinner9310abb2011-10-05 00:59:23 +020010139static PyObject *
10140pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010141 Py_ssize_t left,
10142 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 PyObject *u;
10146 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010147 int kind;
10148 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149
10150 if (left < 0)
10151 left = 0;
10152 if (right < 0)
10153 right = 0;
10154
Victor Stinnerc4b49542011-12-11 22:44:26 +010010155 if (left == 0 && right == 0)
10156 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10159 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010160 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10161 return NULL;
10162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010164 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010166 if (!u)
10167 return NULL;
10168
10169 kind = PyUnicode_KIND(u);
10170 data = PyUnicode_DATA(u);
10171 if (left)
10172 FILL(kind, data, fill, 0, left);
10173 if (right)
10174 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010175 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010176 assert(_PyUnicode_CheckConsistency(u, 1));
10177 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178}
10179
Alexander Belopolsky40018472011-02-26 01:02:56 +000010180PyObject *
10181PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010185 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187
Benjamin Petersonead6b532011-12-20 17:23:42 -060010188 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 if (PyUnicode_IS_ASCII(string))
10191 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193 PyUnicode_GET_LENGTH(string), keepends);
10194 else
10195 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010197 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 break;
10199 case PyUnicode_2BYTE_KIND:
10200 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010201 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 PyUnicode_GET_LENGTH(string), keepends);
10203 break;
10204 case PyUnicode_4BYTE_KIND:
10205 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010206 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 PyUnicode_GET_LENGTH(string), keepends);
10208 break;
10209 default:
10210 assert(0);
10211 list = 0;
10212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214}
10215
Alexander Belopolsky40018472011-02-26 01:02:56 +000010216static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010217split(PyObject *self,
10218 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010219 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010221 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 void *buf1, *buf2;
10223 Py_ssize_t len1, len2;
10224 PyObject* out;
10225
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010227 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 if (PyUnicode_READY(self) == -1)
10230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010233 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010235 if (PyUnicode_IS_ASCII(self))
10236 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010237 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010238 PyUnicode_GET_LENGTH(self), maxcount
10239 );
10240 else
10241 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010242 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010243 PyUnicode_GET_LENGTH(self), maxcount
10244 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 case PyUnicode_2BYTE_KIND:
10246 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010247 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 PyUnicode_GET_LENGTH(self), maxcount
10249 );
10250 case PyUnicode_4BYTE_KIND:
10251 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010252 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 PyUnicode_GET_LENGTH(self), maxcount
10254 );
10255 default:
10256 assert(0);
10257 return NULL;
10258 }
10259
10260 if (PyUnicode_READY(substring) == -1)
10261 return NULL;
10262
10263 kind1 = PyUnicode_KIND(self);
10264 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 len1 = PyUnicode_GET_LENGTH(self);
10266 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010267 if (kind1 < kind2 || len1 < len2) {
10268 out = PyList_New(1);
10269 if (out == NULL)
10270 return NULL;
10271 Py_INCREF(self);
10272 PyList_SET_ITEM(out, 0, self);
10273 return out;
10274 }
10275 buf1 = PyUnicode_DATA(self);
10276 buf2 = PyUnicode_DATA(substring);
10277 if (kind2 != kind1) {
10278 buf2 = _PyUnicode_AsKind(substring, kind1);
10279 if (!buf2)
10280 return NULL;
10281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010283 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010285 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10286 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010287 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010288 else
10289 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 break;
10292 case PyUnicode_2BYTE_KIND:
10293 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010294 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 break;
10296 case PyUnicode_4BYTE_KIND:
10297 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010298 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 break;
10300 default:
10301 out = NULL;
10302 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010303 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 PyMem_Free(buf2);
10305 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306}
10307
Alexander Belopolsky40018472011-02-26 01:02:56 +000010308static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010309rsplit(PyObject *self,
10310 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010311 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010312{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010313 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 void *buf1, *buf2;
10315 Py_ssize_t len1, len2;
10316 PyObject* out;
10317
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010318 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010319 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (PyUnicode_READY(self) == -1)
10322 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010325 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010327 if (PyUnicode_IS_ASCII(self))
10328 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010330 PyUnicode_GET_LENGTH(self), maxcount
10331 );
10332 else
10333 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010335 PyUnicode_GET_LENGTH(self), maxcount
10336 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 case PyUnicode_2BYTE_KIND:
10338 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 PyUnicode_GET_LENGTH(self), maxcount
10341 );
10342 case PyUnicode_4BYTE_KIND:
10343 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010344 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 PyUnicode_GET_LENGTH(self), maxcount
10346 );
10347 default:
10348 assert(0);
10349 return NULL;
10350 }
10351
10352 if (PyUnicode_READY(substring) == -1)
10353 return NULL;
10354
10355 kind1 = PyUnicode_KIND(self);
10356 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 len1 = PyUnicode_GET_LENGTH(self);
10358 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010359 if (kind1 < kind2 || len1 < len2) {
10360 out = PyList_New(1);
10361 if (out == NULL)
10362 return NULL;
10363 Py_INCREF(self);
10364 PyList_SET_ITEM(out, 0, self);
10365 return out;
10366 }
10367 buf1 = PyUnicode_DATA(self);
10368 buf2 = PyUnicode_DATA(substring);
10369 if (kind2 != kind1) {
10370 buf2 = _PyUnicode_AsKind(substring, kind1);
10371 if (!buf2)
10372 return NULL;
10373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010375 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010377 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10378 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010380 else
10381 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010382 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 break;
10384 case PyUnicode_2BYTE_KIND:
10385 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 break;
10388 case PyUnicode_4BYTE_KIND:
10389 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 break;
10392 default:
10393 out = NULL;
10394 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010395 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 PyMem_Free(buf2);
10397 return out;
10398}
10399
10400static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010401anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10402 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010404 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010406 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10407 return asciilib_find(buf1, len1, buf2, len2, offset);
10408 else
10409 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 case PyUnicode_2BYTE_KIND:
10411 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10412 case PyUnicode_4BYTE_KIND:
10413 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10414 }
10415 assert(0);
10416 return -1;
10417}
10418
10419static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010420anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10421 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010423 switch (kind) {
10424 case PyUnicode_1BYTE_KIND:
10425 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10426 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10427 else
10428 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10429 case PyUnicode_2BYTE_KIND:
10430 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10431 case PyUnicode_4BYTE_KIND:
10432 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10433 }
10434 assert(0);
10435 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010436}
10437
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010438static void
10439replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10440 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10441{
10442 int kind = PyUnicode_KIND(u);
10443 void *data = PyUnicode_DATA(u);
10444 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10445 if (kind == PyUnicode_1BYTE_KIND) {
10446 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10447 (Py_UCS1 *)data + len,
10448 u1, u2, maxcount);
10449 }
10450 else if (kind == PyUnicode_2BYTE_KIND) {
10451 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10452 (Py_UCS2 *)data + len,
10453 u1, u2, maxcount);
10454 }
10455 else {
10456 assert(kind == PyUnicode_4BYTE_KIND);
10457 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10458 (Py_UCS4 *)data + len,
10459 u1, u2, maxcount);
10460 }
10461}
10462
Alexander Belopolsky40018472011-02-26 01:02:56 +000010463static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464replace(PyObject *self, PyObject *str1,
10465 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 PyObject *u;
10468 char *sbuf = PyUnicode_DATA(self);
10469 char *buf1 = PyUnicode_DATA(str1);
10470 char *buf2 = PyUnicode_DATA(str2);
10471 int srelease = 0, release1 = 0, release2 = 0;
10472 int skind = PyUnicode_KIND(self);
10473 int kind1 = PyUnicode_KIND(str1);
10474 int kind2 = PyUnicode_KIND(str2);
10475 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10476 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10477 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010478 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010479 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480
10481 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010482 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010484 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485
Victor Stinner59de0ee2011-10-07 10:01:28 +020010486 if (str1 == str2)
10487 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488
Victor Stinner49a0a212011-10-12 23:46:10 +020010489 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010490 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10491 if (maxchar < maxchar_str1)
10492 /* substring too wide to be present */
10493 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010494 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10495 /* Replacing str1 with str2 may cause a maxchar reduction in the
10496 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010497 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010498 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010501 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010503 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010505 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010506 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010507 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010508
Victor Stinner69ed0f42013-04-09 21:48:24 +020010509 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010510 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010511 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010512 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010513 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010515 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010517
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010518 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10519 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010520 }
10521 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 int rkind = skind;
10523 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010524 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 if (kind1 < rkind) {
10527 /* widen substring */
10528 buf1 = _PyUnicode_AsKind(str1, rkind);
10529 if (!buf1) goto error;
10530 release1 = 1;
10531 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010532 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010533 if (i < 0)
10534 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 if (rkind > kind2) {
10536 /* widen replacement */
10537 buf2 = _PyUnicode_AsKind(str2, rkind);
10538 if (!buf2) goto error;
10539 release2 = 1;
10540 }
10541 else if (rkind < kind2) {
10542 /* widen self and buf1 */
10543 rkind = kind2;
10544 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010545 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 sbuf = _PyUnicode_AsKind(self, rkind);
10547 if (!sbuf) goto error;
10548 srelease = 1;
10549 buf1 = _PyUnicode_AsKind(str1, rkind);
10550 if (!buf1) goto error;
10551 release1 = 1;
10552 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010553 u = PyUnicode_New(slen, maxchar);
10554 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010556 assert(PyUnicode_KIND(u) == rkind);
10557 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010558
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010559 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010560 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010561 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010563 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010565
10566 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010567 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010568 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010569 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010570 if (i == -1)
10571 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010572 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010574 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010578 }
10579 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010581 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 int rkind = skind;
10583 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 buf1 = _PyUnicode_AsKind(str1, rkind);
10588 if (!buf1) goto error;
10589 release1 = 1;
10590 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010591 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010592 if (n == 0)
10593 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010595 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 buf2 = _PyUnicode_AsKind(str2, rkind);
10597 if (!buf2) goto error;
10598 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010601 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 rkind = kind2;
10603 sbuf = _PyUnicode_AsKind(self, rkind);
10604 if (!sbuf) goto error;
10605 srelease = 1;
10606 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010607 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 buf1 = _PyUnicode_AsKind(str1, rkind);
10609 if (!buf1) goto error;
10610 release1 = 1;
10611 }
10612 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10613 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010614 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 PyErr_SetString(PyExc_OverflowError,
10616 "replace string is too long");
10617 goto error;
10618 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010619 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010620 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010621 _Py_INCREF_UNICODE_EMPTY();
10622 if (!unicode_empty)
10623 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010624 u = unicode_empty;
10625 goto done;
10626 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010627 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 PyErr_SetString(PyExc_OverflowError,
10629 "replace string is too long");
10630 goto error;
10631 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010632 u = PyUnicode_New(new_size, maxchar);
10633 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010635 assert(PyUnicode_KIND(u) == rkind);
10636 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 ires = i = 0;
10638 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010639 while (n-- > 0) {
10640 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010641 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010642 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010643 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010644 if (j == -1)
10645 break;
10646 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010647 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010648 memcpy(res + rkind * ires,
10649 sbuf + rkind * i,
10650 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 }
10653 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res + rkind * ires,
10665 sbuf + rkind * i,
10666 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010667 }
10668 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669 /* interleave */
10670 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010671 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 if (--n <= 0)
10676 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010677 memcpy(res + rkind * ires,
10678 sbuf + rkind * i,
10679 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 ires++;
10681 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010682 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010683 memcpy(res + rkind * ires,
10684 sbuf + rkind * i,
10685 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 }
10688
10689 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010690 unicode_adjust_maxchar(&u);
10691 if (u == NULL)
10692 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010694
10695 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 if (srelease)
10697 PyMem_FREE(sbuf);
10698 if (release1)
10699 PyMem_FREE(buf1);
10700 if (release2)
10701 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010702 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010704
Benjamin Peterson29060642009-01-31 22:14:21 +000010705 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010706 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 if (srelease)
10708 PyMem_FREE(sbuf);
10709 if (release1)
10710 PyMem_FREE(buf1);
10711 if (release2)
10712 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010713 return unicode_result_unchanged(self);
10714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 error:
10716 if (srelease && sbuf)
10717 PyMem_FREE(sbuf);
10718 if (release1 && buf1)
10719 PyMem_FREE(buf1);
10720 if (release2 && buf2)
10721 PyMem_FREE(buf2);
10722 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723}
10724
10725/* --- Unicode Object Methods --------------------------------------------- */
10726
INADA Naoki3ae20562017-01-16 20:41:20 +090010727/*[clinic input]
10728str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
INADA Naoki3ae20562017-01-16 20:41:20 +090010730Return a version of the string where each word is titlecased.
10731
10732More specifically, words start with uppercased characters and all remaining
10733cased characters have lower case.
10734[clinic start generated code]*/
10735
10736static PyObject *
10737unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010738/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010740 if (PyUnicode_READY(self) == -1)
10741 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010742 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743}
10744
INADA Naoki3ae20562017-01-16 20:41:20 +090010745/*[clinic input]
10746str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
INADA Naoki3ae20562017-01-16 20:41:20 +090010748Return a capitalized version of the string.
10749
10750More specifically, make the first character have upper case and the rest lower
10751case.
10752[clinic start generated code]*/
10753
10754static PyObject *
10755unicode_capitalize_impl(PyObject *self)
10756/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010758 if (PyUnicode_READY(self) == -1)
10759 return NULL;
10760 if (PyUnicode_GET_LENGTH(self) == 0)
10761 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010762 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763}
10764
INADA Naoki3ae20562017-01-16 20:41:20 +090010765/*[clinic input]
10766str.casefold as unicode_casefold
10767
10768Return a version of the string suitable for caseless comparisons.
10769[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010770
10771static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010772unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010773/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010774{
10775 if (PyUnicode_READY(self) == -1)
10776 return NULL;
10777 if (PyUnicode_IS_ASCII(self))
10778 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010779 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010780}
10781
10782
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010783/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010784
10785static int
10786convert_uc(PyObject *obj, void *addr)
10787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010789
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010790 if (!PyUnicode_Check(obj)) {
10791 PyErr_Format(PyExc_TypeError,
10792 "The fill character must be a unicode character, "
10793 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010794 return 0;
10795 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010796 if (PyUnicode_READY(obj) < 0)
10797 return 0;
10798 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010799 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010800 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010801 return 0;
10802 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010803 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010804 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010805}
10806
INADA Naoki3ae20562017-01-16 20:41:20 +090010807/*[clinic input]
10808str.center as unicode_center
10809
10810 width: Py_ssize_t
10811 fillchar: Py_UCS4 = ' '
10812 /
10813
10814Return a centered string of length width.
10815
10816Padding is done using the specified fill character (default is a space).
10817[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818
10819static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010820unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10821/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010823 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
Benjamin Petersonbac79492012-01-14 13:34:47 -050010825 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826 return NULL;
10827
Victor Stinnerc4b49542011-12-11 22:44:26 +010010828 if (PyUnicode_GET_LENGTH(self) >= width)
10829 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
Victor Stinnerc4b49542011-12-11 22:44:26 +010010831 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832 left = marg / 2 + (marg & width & 1);
10833
Victor Stinner9310abb2011-10-05 00:59:23 +020010834 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835}
10836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837/* This function assumes that str1 and str2 are readied by the caller. */
10838
Marc-André Lemburge5034372000-08-08 08:04:29 +000010839static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010840unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010841{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010842#define COMPARE(TYPE1, TYPE2) \
10843 do { \
10844 TYPE1* p1 = (TYPE1 *)data1; \
10845 TYPE2* p2 = (TYPE2 *)data2; \
10846 TYPE1* end = p1 + len; \
10847 Py_UCS4 c1, c2; \
10848 for (; p1 != end; p1++, p2++) { \
10849 c1 = *p1; \
10850 c2 = *p2; \
10851 if (c1 != c2) \
10852 return (c1 < c2) ? -1 : 1; \
10853 } \
10854 } \
10855 while (0)
10856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 int kind1, kind2;
10858 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010859 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 kind1 = PyUnicode_KIND(str1);
10862 kind2 = PyUnicode_KIND(str2);
10863 data1 = PyUnicode_DATA(str1);
10864 data2 = PyUnicode_DATA(str2);
10865 len1 = PyUnicode_GET_LENGTH(str1);
10866 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010867 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010868
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010869 switch(kind1) {
10870 case PyUnicode_1BYTE_KIND:
10871 {
10872 switch(kind2) {
10873 case PyUnicode_1BYTE_KIND:
10874 {
10875 int cmp = memcmp(data1, data2, len);
10876 /* normalize result of memcmp() into the range [-1; 1] */
10877 if (cmp < 0)
10878 return -1;
10879 if (cmp > 0)
10880 return 1;
10881 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010882 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010883 case PyUnicode_2BYTE_KIND:
10884 COMPARE(Py_UCS1, Py_UCS2);
10885 break;
10886 case PyUnicode_4BYTE_KIND:
10887 COMPARE(Py_UCS1, Py_UCS4);
10888 break;
10889 default:
10890 assert(0);
10891 }
10892 break;
10893 }
10894 case PyUnicode_2BYTE_KIND:
10895 {
10896 switch(kind2) {
10897 case PyUnicode_1BYTE_KIND:
10898 COMPARE(Py_UCS2, Py_UCS1);
10899 break;
10900 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010901 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010902 COMPARE(Py_UCS2, Py_UCS2);
10903 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010904 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010905 case PyUnicode_4BYTE_KIND:
10906 COMPARE(Py_UCS2, Py_UCS4);
10907 break;
10908 default:
10909 assert(0);
10910 }
10911 break;
10912 }
10913 case PyUnicode_4BYTE_KIND:
10914 {
10915 switch(kind2) {
10916 case PyUnicode_1BYTE_KIND:
10917 COMPARE(Py_UCS4, Py_UCS1);
10918 break;
10919 case PyUnicode_2BYTE_KIND:
10920 COMPARE(Py_UCS4, Py_UCS2);
10921 break;
10922 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010923 {
10924#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10925 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10926 /* normalize result of wmemcmp() into the range [-1; 1] */
10927 if (cmp < 0)
10928 return -1;
10929 if (cmp > 0)
10930 return 1;
10931#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010932 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010933#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010934 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010935 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010936 default:
10937 assert(0);
10938 }
10939 break;
10940 }
10941 default:
10942 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010943 }
10944
Victor Stinner770e19e2012-10-04 22:59:45 +020010945 if (len1 == len2)
10946 return 0;
10947 if (len1 < len2)
10948 return -1;
10949 else
10950 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010951
10952#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010953}
10954
Benjamin Peterson621b4302016-09-09 13:54:34 -070010955static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010956unicode_compare_eq(PyObject *str1, PyObject *str2)
10957{
10958 int kind;
10959 void *data1, *data2;
10960 Py_ssize_t len;
10961 int cmp;
10962
Victor Stinnere5567ad2012-10-23 02:48:49 +020010963 len = PyUnicode_GET_LENGTH(str1);
10964 if (PyUnicode_GET_LENGTH(str2) != len)
10965 return 0;
10966 kind = PyUnicode_KIND(str1);
10967 if (PyUnicode_KIND(str2) != kind)
10968 return 0;
10969 data1 = PyUnicode_DATA(str1);
10970 data2 = PyUnicode_DATA(str2);
10971
10972 cmp = memcmp(data1, data2, len * kind);
10973 return (cmp == 0);
10974}
10975
10976
Alexander Belopolsky40018472011-02-26 01:02:56 +000010977int
10978PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10981 if (PyUnicode_READY(left) == -1 ||
10982 PyUnicode_READY(right) == -1)
10983 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010984
10985 /* a string is equal to itself */
10986 if (left == right)
10987 return 0;
10988
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010989 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010991 PyErr_Format(PyExc_TypeError,
10992 "Can't compare %.100s and %.100s",
10993 left->ob_type->tp_name,
10994 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 return -1;
10996}
10997
Martin v. Löwis5b222132007-06-10 09:51:05 +000010998int
10999PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11000{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 Py_ssize_t i;
11002 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011004 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005
Victor Stinner910337b2011-10-03 03:20:16 +020011006 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011007 if (!PyUnicode_IS_READY(uni)) {
11008 const wchar_t *ws = _PyUnicode_WSTR(uni);
11009 /* Compare Unicode string and source character set string */
11010 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11011 if (chr != ustr[i])
11012 return (chr < ustr[i]) ? -1 : 1;
11013 }
11014 /* This check keeps Python strings that end in '\0' from comparing equal
11015 to C strings identical up to that point. */
11016 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11017 return 1; /* uni is longer */
11018 if (ustr[i])
11019 return -1; /* str is longer */
11020 return 0;
11021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011023 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011024 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011025 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011026 size_t len, len2 = strlen(str);
11027 int cmp;
11028
11029 len = Py_MIN(len1, len2);
11030 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011031 if (cmp != 0) {
11032 if (cmp < 0)
11033 return -1;
11034 else
11035 return 1;
11036 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011037 if (len1 > len2)
11038 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011039 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011040 return -1; /* str is longer */
11041 return 0;
11042 }
11043 else {
11044 void *data = PyUnicode_DATA(uni);
11045 /* Compare Unicode string and source character set string */
11046 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011047 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011048 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11049 /* This check keeps Python strings that end in '\0' from comparing equal
11050 to C strings identical up to that point. */
11051 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11052 return 1; /* uni is longer */
11053 if (str[i])
11054 return -1; /* str is longer */
11055 return 0;
11056 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011057}
11058
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011059static int
11060non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11061{
11062 size_t i, len;
11063 const wchar_t *p;
11064 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11065 if (strlen(str) != len)
11066 return 0;
11067 p = _PyUnicode_WSTR(unicode);
11068 assert(p);
11069 for (i = 0; i < len; i++) {
11070 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011071 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011072 return 0;
11073 }
11074 return 1;
11075}
11076
11077int
11078_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11079{
11080 size_t len;
11081 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011082 assert(str);
11083#ifndef NDEBUG
11084 for (const char *p = str; *p; p++) {
11085 assert((unsigned char)*p < 128);
11086 }
11087#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011088 if (PyUnicode_READY(unicode) == -1) {
11089 /* Memory error or bad data */
11090 PyErr_Clear();
11091 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11092 }
11093 if (!PyUnicode_IS_ASCII(unicode))
11094 return 0;
11095 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11096 return strlen(str) == len &&
11097 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11098}
11099
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011100int
11101_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11102{
11103 PyObject *right_uni;
11104 Py_hash_t hash;
11105
11106 assert(_PyUnicode_CHECK(left));
11107 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011108#ifndef NDEBUG
11109 for (const char *p = right->string; *p; p++) {
11110 assert((unsigned char)*p < 128);
11111 }
11112#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011113
11114 if (PyUnicode_READY(left) == -1) {
11115 /* memory error or bad data */
11116 PyErr_Clear();
11117 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11118 }
11119
11120 if (!PyUnicode_IS_ASCII(left))
11121 return 0;
11122
11123 right_uni = _PyUnicode_FromId(right); /* borrowed */
11124 if (right_uni == NULL) {
11125 /* memory error or bad data */
11126 PyErr_Clear();
11127 return _PyUnicode_EqualToASCIIString(left, right->string);
11128 }
11129
11130 if (left == right_uni)
11131 return 1;
11132
11133 if (PyUnicode_CHECK_INTERNED(left))
11134 return 0;
11135
11136 assert(_PyUnicode_HASH(right_uni) != 1);
11137 hash = _PyUnicode_HASH(left);
11138 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11139 return 0;
11140
11141 return unicode_compare_eq(left, right_uni);
11142}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011143
/* Map a C truth value onto Py_True / Py_False.  The macro does not touch
   the reference count of the object it yields. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011146
Alexander Belopolsky40018472011-02-26 01:02:56 +000011147PyObject *
11148PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011149{
11150 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011151 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011152
Victor Stinnere5567ad2012-10-23 02:48:49 +020011153 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11154 Py_RETURN_NOTIMPLEMENTED;
11155
11156 if (PyUnicode_READY(left) == -1 ||
11157 PyUnicode_READY(right) == -1)
11158 return NULL;
11159
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011160 if (left == right) {
11161 switch (op) {
11162 case Py_EQ:
11163 case Py_LE:
11164 case Py_GE:
11165 /* a string is equal to itself */
11166 v = Py_True;
11167 break;
11168 case Py_NE:
11169 case Py_LT:
11170 case Py_GT:
11171 v = Py_False;
11172 break;
11173 default:
11174 PyErr_BadArgument();
11175 return NULL;
11176 }
11177 }
11178 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011179 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011180 result ^= (op == Py_NE);
11181 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011182 }
11183 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011184 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011185
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011186 /* Convert the return value to a Boolean */
11187 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011188 case Py_LE:
11189 v = TEST_COND(result <= 0);
11190 break;
11191 case Py_GE:
11192 v = TEST_COND(result >= 0);
11193 break;
11194 case Py_LT:
11195 v = TEST_COND(result == -1);
11196 break;
11197 case Py_GT:
11198 v = TEST_COND(result == 1);
11199 break;
11200 default:
11201 PyErr_BadArgument();
11202 return NULL;
11203 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011204 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011205 Py_INCREF(v);
11206 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011207}
11208
Alexander Belopolsky40018472011-02-26 01:02:56 +000011209int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011210_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11211{
11212 return unicode_eq(aa, bb);
11213}
11214
11215int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011216PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011217{
Victor Stinner77282cb2013-04-14 19:22:47 +020011218 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 void *buf1, *buf2;
11220 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011221 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011222
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011223 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 "'in <string>' requires string as left operand, not %.100s",
11226 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011227 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011228 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011230 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011231 if (ensure_unicode(str) < 0)
11232 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011235 kind2 = PyUnicode_KIND(substr);
11236 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011237 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011239 len2 = PyUnicode_GET_LENGTH(substr);
11240 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011241 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011242 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011243 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011244 if (len2 == 1) {
11245 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11246 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011247 return result;
11248 }
11249 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011250 buf2 = _PyUnicode_AsKind(substr, kind1);
11251 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011252 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254
Victor Stinner77282cb2013-04-14 19:22:47 +020011255 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 case PyUnicode_1BYTE_KIND:
11257 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11258 break;
11259 case PyUnicode_2BYTE_KIND:
11260 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11261 break;
11262 case PyUnicode_4BYTE_KIND:
11263 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11264 break;
11265 default:
11266 result = -1;
11267 assert(0);
11268 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011269
Victor Stinner77282cb2013-04-14 19:22:47 +020011270 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 PyMem_Free(buf2);
11272
Guido van Rossum403d68b2000-03-13 15:55:09 +000011273 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011274}
11275
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276/* Concat to string or Unicode object giving a new Unicode object. */
11277
Alexander Belopolsky40018472011-02-26 01:02:56 +000011278PyObject *
11279PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011281 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011282 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011283 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011285 if (ensure_unicode(left) < 0)
11286 return NULL;
11287
11288 if (!PyUnicode_Check(right)) {
11289 PyErr_Format(PyExc_TypeError,
11290 "can only concatenate str (not \"%.200s\") to str",
11291 right->ob_type->tp_name);
11292 return NULL;
11293 }
11294 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296
11297 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011298 if (left == unicode_empty)
11299 return PyUnicode_FromObject(right);
11300 if (right == unicode_empty)
11301 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303 left_len = PyUnicode_GET_LENGTH(left);
11304 right_len = PyUnicode_GET_LENGTH(right);
11305 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011306 PyErr_SetString(PyExc_OverflowError,
11307 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011308 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011309 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011310 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011311
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011312 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11313 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011314 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011317 result = PyUnicode_New(new_len, maxchar);
11318 if (result == NULL)
11319 return NULL;
11320 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11321 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11322 assert(_PyUnicode_CheckConsistency(result, 1));
11323 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324}
11325
Walter Dörwald1ab83302007-05-18 17:15:44 +000011326void
Victor Stinner23e56682011-10-03 03:54:37 +020011327PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011328{
Victor Stinner23e56682011-10-03 03:54:37 +020011329 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011330 Py_UCS4 maxchar, maxchar2;
11331 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011332
11333 if (p_left == NULL) {
11334 if (!PyErr_Occurred())
11335 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011336 return;
11337 }
Victor Stinner23e56682011-10-03 03:54:37 +020011338 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011339 if (right == NULL || left == NULL
11340 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011341 if (!PyErr_Occurred())
11342 PyErr_BadInternalCall();
11343 goto error;
11344 }
11345
Benjamin Petersonbac79492012-01-14 13:34:47 -050011346 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011347 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011348 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011349 goto error;
11350
Victor Stinner488fa492011-12-12 00:01:39 +010011351 /* Shortcuts */
11352 if (left == unicode_empty) {
11353 Py_DECREF(left);
11354 Py_INCREF(right);
11355 *p_left = right;
11356 return;
11357 }
11358 if (right == unicode_empty)
11359 return;
11360
11361 left_len = PyUnicode_GET_LENGTH(left);
11362 right_len = PyUnicode_GET_LENGTH(right);
11363 if (left_len > PY_SSIZE_T_MAX - right_len) {
11364 PyErr_SetString(PyExc_OverflowError,
11365 "strings are too large to concat");
11366 goto error;
11367 }
11368 new_len = left_len + right_len;
11369
11370 if (unicode_modifiable(left)
11371 && PyUnicode_CheckExact(right)
11372 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011373 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11374 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011375 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011376 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011377 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11378 {
11379 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011380 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011381 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011382
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011383 /* copy 'right' into the newly allocated area of 'left' */
11384 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011385 }
Victor Stinner488fa492011-12-12 00:01:39 +010011386 else {
11387 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11388 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011389 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011390
Victor Stinner488fa492011-12-12 00:01:39 +010011391 /* Concat the two Unicode strings */
11392 res = PyUnicode_New(new_len, maxchar);
11393 if (res == NULL)
11394 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011395 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11396 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011397 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011398 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011399 }
11400 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011401 return;
11402
11403error:
Victor Stinner488fa492011-12-12 00:01:39 +010011404 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011405}
11406
11407void
11408PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11409{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011410 PyUnicode_Append(pleft, right);
11411 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011412}
11413
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011414/*
11415Wraps stringlib_parse_args_finds() and additionally ensures that the
11416first argument is a unicode object.
11417*/
11418
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011419static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011420parse_args_finds_unicode(const char * function_name, PyObject *args,
11421 PyObject **substring,
11422 Py_ssize_t *start, Py_ssize_t *end)
11423{
11424 if(stringlib_parse_args_finds(function_name, args, substring,
11425 start, end)) {
11426 if (ensure_unicode(*substring) < 0)
11427 return 0;
11428 return 1;
11429 }
11430 return 0;
11431}
11432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011436Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011437string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011438interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
11440static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011441unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011443 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011444 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011445 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011447 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 void *buf1, *buf2;
11449 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011451 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 kind1 = PyUnicode_KIND(self);
11455 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011456 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011457 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 len1 = PyUnicode_GET_LENGTH(self);
11460 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011462 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011463 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011464
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011465 buf1 = PyUnicode_DATA(self);
11466 buf2 = PyUnicode_DATA(substring);
11467 if (kind2 != kind1) {
11468 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011469 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011470 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011471 }
11472 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 case PyUnicode_1BYTE_KIND:
11474 iresult = ucs1lib_count(
11475 ((Py_UCS1*)buf1) + start, end - start,
11476 buf2, len2, PY_SSIZE_T_MAX
11477 );
11478 break;
11479 case PyUnicode_2BYTE_KIND:
11480 iresult = ucs2lib_count(
11481 ((Py_UCS2*)buf1) + start, end - start,
11482 buf2, len2, PY_SSIZE_T_MAX
11483 );
11484 break;
11485 case PyUnicode_4BYTE_KIND:
11486 iresult = ucs4lib_count(
11487 ((Py_UCS4*)buf1) + start, end - start,
11488 buf2, len2, PY_SSIZE_T_MAX
11489 );
11490 break;
11491 default:
11492 assert(0); iresult = 0;
11493 }
11494
11495 result = PyLong_FromSsize_t(iresult);
11496
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011497 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 return result;
11501}
11502
INADA Naoki3ae20562017-01-16 20:41:20 +090011503/*[clinic input]
11504str.encode as unicode_encode
11505
11506 encoding: str(c_default="NULL") = 'utf-8'
11507 The encoding in which to encode the string.
11508 errors: str(c_default="NULL") = 'strict'
11509 The error handling scheme to use for encoding errors.
11510 The default is 'strict' meaning that encoding errors raise a
11511 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11512 'xmlcharrefreplace' as well as any other name registered with
11513 codecs.register_error that can handle UnicodeEncodeErrors.
11514
11515Encode the string using the codec registered for encoding.
11516[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517
11518static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011519unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011520/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011522 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011523}
11524
INADA Naoki3ae20562017-01-16 20:41:20 +090011525/*[clinic input]
11526str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
INADA Naoki3ae20562017-01-16 20:41:20 +090011528 tabsize: int = 8
11529
11530Return a copy where all tab characters are expanded using spaces.
11531
11532If tabsize is not given, a tab size of 8 characters is assumed.
11533[clinic start generated code]*/
11534
11535static PyObject *
11536unicode_expandtabs_impl(PyObject *self, int tabsize)
11537/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011539 Py_ssize_t i, j, line_pos, src_len, incr;
11540 Py_UCS4 ch;
11541 PyObject *u;
11542 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011543 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011544 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
Antoine Pitrou22425222011-10-04 19:10:51 +020011546 if (PyUnicode_READY(self) == -1)
11547 return NULL;
11548
Thomas Wouters7e474022000-07-16 12:04:32 +000011549 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011550 src_len = PyUnicode_GET_LENGTH(self);
11551 i = j = line_pos = 0;
11552 kind = PyUnicode_KIND(self);
11553 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011554 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011555 for (; i < src_len; i++) {
11556 ch = PyUnicode_READ(kind, src_data, i);
11557 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011558 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011560 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011562 goto overflow;
11563 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011565 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011569 goto overflow;
11570 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011572 if (ch == '\n' || ch == '\r')
11573 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011575 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011576 if (!found)
11577 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011578
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011580 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581 if (!u)
11582 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011583 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584
Antoine Pitroue71d5742011-10-04 15:55:09 +020011585 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586
Antoine Pitroue71d5742011-10-04 15:55:09 +020011587 for (; i < src_len; i++) {
11588 ch = PyUnicode_READ(kind, src_data, i);
11589 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011591 incr = tabsize - (line_pos % tabsize);
11592 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011593 FILL(kind, dest_data, ' ', j, incr);
11594 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011596 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011598 line_pos++;
11599 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011600 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011601 if (ch == '\n' || ch == '\r')
11602 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011604 }
11605 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011606 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011607
Antoine Pitroue71d5742011-10-04 15:55:09 +020011608 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011609 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611}
11612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011613PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615\n\
11616Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011617such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618arguments start and end are interpreted as in slice notation.\n\
11619\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011620Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
11622static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011625 /* initialize variables to prevent gcc warning */
11626 PyObject *substring = NULL;
11627 Py_ssize_t start = 0;
11628 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011629 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011631 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011634 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011637 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 if (result == -2)
11640 return NULL;
11641
Christian Heimes217cfd12007-12-02 14:31:20 +000011642 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643}
11644
11645static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011646unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011648 void *data;
11649 enum PyUnicode_Kind kind;
11650 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011651
11652 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11653 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011655 }
11656 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11657 PyErr_SetString(PyExc_IndexError, "string index out of range");
11658 return NULL;
11659 }
11660 kind = PyUnicode_KIND(self);
11661 data = PyUnicode_DATA(self);
11662 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011663 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664}
11665
Guido van Rossumc2504932007-09-18 19:42:40 +000011666/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011667 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011668static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011669unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670{
Guido van Rossumc2504932007-09-18 19:42:40 +000011671 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011672 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011673
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011674#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011675 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011676#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 if (_PyUnicode_HASH(self) != -1)
11678 return _PyUnicode_HASH(self);
11679 if (PyUnicode_READY(self) == -1)
11680 return -1;
11681 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011682 /*
11683 We make the hash of the empty string be 0, rather than using
11684 (prefix ^ suffix), since this slightly obfuscates the hash secret
11685 */
11686 if (len == 0) {
11687 _PyUnicode_HASH(self) = 0;
11688 return 0;
11689 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011690 x = _Py_HashBytes(PyUnicode_DATA(self),
11691 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011693 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694}
11695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011696PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011699Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700
11701static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011704 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011705 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011706 PyObject *substring = NULL;
11707 Py_ssize_t start = 0;
11708 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011710 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011713 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011716 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 if (result == -2)
11719 return NULL;
11720
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 if (result < 0) {
11722 PyErr_SetString(PyExc_ValueError, "substring not found");
11723 return NULL;
11724 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011725
Christian Heimes217cfd12007-12-02 14:31:20 +000011726 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727}
11728
INADA Naoki3ae20562017-01-16 20:41:20 +090011729/*[clinic input]
11730str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
INADA Naoki3ae20562017-01-16 20:41:20 +090011732Return True if the string is a lowercase string, False otherwise.
11733
11734A string is lowercase if all cased characters in the string are lowercase and
11735there is at least one cased character in the string.
11736[clinic start generated code]*/
11737
11738static PyObject *
11739unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011740/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 Py_ssize_t i, length;
11743 int kind;
11744 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 int cased;
11746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (PyUnicode_READY(self) == -1)
11748 return NULL;
11749 length = PyUnicode_GET_LENGTH(self);
11750 kind = PyUnicode_KIND(self);
11751 data = PyUnicode_DATA(self);
11752
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 if (length == 1)
11755 return PyBool_FromLong(
11756 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011758 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011760 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011761
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 for (i = 0; i < length; i++) {
11764 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011765
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011767 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 else if (!cased && Py_UNICODE_ISLOWER(ch))
11769 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011771 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772}
11773
INADA Naoki3ae20562017-01-16 20:41:20 +090011774/*[clinic input]
11775str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
INADA Naoki3ae20562017-01-16 20:41:20 +090011777Return True if the string is an uppercase string, False otherwise.
11778
11779A string is uppercase if all cased characters in the string are uppercase and
11780there is at least one cased character in the string.
11781[clinic start generated code]*/
11782
11783static PyObject *
11784unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011785/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 Py_ssize_t i, length;
11788 int kind;
11789 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 int cased;
11791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 if (PyUnicode_READY(self) == -1)
11793 return NULL;
11794 length = PyUnicode_GET_LENGTH(self);
11795 kind = PyUnicode_KIND(self);
11796 data = PyUnicode_DATA(self);
11797
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (length == 1)
11800 return PyBool_FromLong(
11801 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011803 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011805 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011806
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 for (i = 0; i < length; i++) {
11809 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011810
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011812 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 else if (!cased && Py_UNICODE_ISUPPER(ch))
11814 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011816 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817}
11818
INADA Naoki3ae20562017-01-16 20:41:20 +090011819/*[clinic input]
11820str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
INADA Naoki3ae20562017-01-16 20:41:20 +090011822Return True if the string is a title-cased string, False otherwise.
11823
11824In a title-cased string, upper- and title-case characters may only
11825follow uncased characters and lowercase characters only cased ones.
11826[clinic start generated code]*/
11827
11828static PyObject *
11829unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011830/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 Py_ssize_t i, length;
11833 int kind;
11834 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 int cased, previous_is_cased;
11836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (PyUnicode_READY(self) == -1)
11838 return NULL;
11839 length = PyUnicode_GET_LENGTH(self);
11840 kind = PyUnicode_KIND(self);
11841 data = PyUnicode_DATA(self);
11842
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (length == 1) {
11845 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11846 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11847 (Py_UNICODE_ISUPPER(ch) != 0));
11848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011850 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011852 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011853
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854 cased = 0;
11855 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 for (i = 0; i < length; i++) {
11857 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011858
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11860 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011861 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 previous_is_cased = 1;
11863 cased = 1;
11864 }
11865 else if (Py_UNICODE_ISLOWER(ch)) {
11866 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011867 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 previous_is_cased = 1;
11869 cased = 1;
11870 }
11871 else
11872 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011874 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875}
11876
INADA Naoki3ae20562017-01-16 20:41:20 +090011877/*[clinic input]
11878str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879
INADA Naoki3ae20562017-01-16 20:41:20 +090011880Return True if the string is a whitespace string, False otherwise.
11881
11882A string is whitespace if all characters in the string are whitespace and there
11883is at least one character in the string.
11884[clinic start generated code]*/
11885
11886static PyObject *
11887unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011888/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 Py_ssize_t i, length;
11891 int kind;
11892 void *data;
11893
11894 if (PyUnicode_READY(self) == -1)
11895 return NULL;
11896 length = PyUnicode_GET_LENGTH(self);
11897 kind = PyUnicode_KIND(self);
11898 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 if (length == 1)
11902 return PyBool_FromLong(
11903 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011905 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011907 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 for (i = 0; i < length; i++) {
11910 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011911 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011912 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011914 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915}
11916
INADA Naoki3ae20562017-01-16 20:41:20 +090011917/*[clinic input]
11918str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011919
INADA Naoki3ae20562017-01-16 20:41:20 +090011920Return True if the string is an alphabetic string, False otherwise.
11921
11922A string is alphabetic if all characters in the string are alphabetic and there
11923is at least one character in the string.
11924[clinic start generated code]*/
11925
11926static PyObject *
11927unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011928/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 Py_ssize_t i, length;
11931 int kind;
11932 void *data;
11933
11934 if (PyUnicode_READY(self) == -1)
11935 return NULL;
11936 length = PyUnicode_GET_LENGTH(self);
11937 kind = PyUnicode_KIND(self);
11938 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011939
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011940 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 if (length == 1)
11942 return PyBool_FromLong(
11943 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011944
11945 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011947 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 for (i = 0; i < length; i++) {
11950 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011951 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011952 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011953 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011954}
11955
INADA Naoki3ae20562017-01-16 20:41:20 +090011956/*[clinic input]
11957str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011958
INADA Naoki3ae20562017-01-16 20:41:20 +090011959Return True if the string is an alpha-numeric string, False otherwise.
11960
11961A string is alpha-numeric if all characters in the string are alpha-numeric and
11962there is at least one character in the string.
11963[clinic start generated code]*/
11964
11965static PyObject *
11966unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011967/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 int kind;
11970 void *data;
11971 Py_ssize_t len, i;
11972
11973 if (PyUnicode_READY(self) == -1)
11974 return NULL;
11975
11976 kind = PyUnicode_KIND(self);
11977 data = PyUnicode_DATA(self);
11978 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011979
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011980 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 if (len == 1) {
11982 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11983 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11984 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011985
11986 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011988 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 for (i = 0; i < len; i++) {
11991 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011992 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011993 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011994 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011995 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011996}
11997
INADA Naoki3ae20562017-01-16 20:41:20 +090011998/*[clinic input]
11999str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
INADA Naoki3ae20562017-01-16 20:41:20 +090012001Return True if the string is a decimal string, False otherwise.
12002
12003A string is a decimal string if all characters in the string are decimal and
12004there is at least one character in the string.
12005[clinic start generated code]*/
12006
12007static PyObject *
12008unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012009/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 Py_ssize_t i, length;
12012 int kind;
12013 void *data;
12014
12015 if (PyUnicode_READY(self) == -1)
12016 return NULL;
12017 length = PyUnicode_GET_LENGTH(self);
12018 kind = PyUnicode_KIND(self);
12019 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 if (length == 1)
12023 return PyBool_FromLong(
12024 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012026 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012028 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 for (i = 0; i < length; i++) {
12031 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012032 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012034 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035}
12036
INADA Naoki3ae20562017-01-16 20:41:20 +090012037/*[clinic input]
12038str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039
INADA Naoki3ae20562017-01-16 20:41:20 +090012040Return True if the string is a digit string, False otherwise.
12041
12042A string is a digit string if all characters in the string are digits and there
12043is at least one character in the string.
12044[clinic start generated code]*/
12045
12046static PyObject *
12047unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012048/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 Py_ssize_t i, length;
12051 int kind;
12052 void *data;
12053
12054 if (PyUnicode_READY(self) == -1)
12055 return NULL;
12056 length = PyUnicode_GET_LENGTH(self);
12057 kind = PyUnicode_KIND(self);
12058 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 if (length == 1) {
12062 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12063 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012066 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012068 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 for (i = 0; i < length; i++) {
12071 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012072 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012074 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075}
12076
INADA Naoki3ae20562017-01-16 20:41:20 +090012077/*[clinic input]
12078str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079
INADA Naoki3ae20562017-01-16 20:41:20 +090012080Return True if the string is a numeric string, False otherwise.
12081
12082A string is numeric if all characters in the string are numeric and there is at
12083least one character in the string.
12084[clinic start generated code]*/
12085
12086static PyObject *
12087unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012088/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 Py_ssize_t i, length;
12091 int kind;
12092 void *data;
12093
12094 if (PyUnicode_READY(self) == -1)
12095 return NULL;
12096 length = PyUnicode_GET_LENGTH(self);
12097 kind = PyUnicode_KIND(self);
12098 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 if (length == 1)
12102 return PyBool_FromLong(
12103 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012105 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012107 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 for (i = 0; i < length; i++) {
12110 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012111 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012113 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114}
12115
Martin v. Löwis47383402007-08-15 07:32:56 +000012116int
12117PyUnicode_IsIdentifier(PyObject *self)
12118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 int kind;
12120 void *data;
12121 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012122 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 if (PyUnicode_READY(self) == -1) {
12125 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012126 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 }
12128
12129 /* Special case for empty strings */
12130 if (PyUnicode_GET_LENGTH(self) == 0)
12131 return 0;
12132 kind = PyUnicode_KIND(self);
12133 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012134
12135 /* PEP 3131 says that the first character must be in
12136 XID_Start and subsequent characters in XID_Continue,
12137 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012138 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012139 letters, digits, underscore). However, given the current
12140 definition of XID_Start and XID_Continue, it is sufficient
12141 to check just for these, except that _ must be allowed
12142 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012144 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012145 return 0;
12146
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012147 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012150 return 1;
12151}
12152
INADA Naoki3ae20562017-01-16 20:41:20 +090012153/*[clinic input]
12154str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012155
INADA Naoki3ae20562017-01-16 20:41:20 +090012156Return True if the string is a valid Python identifier, False otherwise.
12157
12158Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12159"class".
12160[clinic start generated code]*/
12161
12162static PyObject *
12163unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012164/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012165{
12166 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12167}
12168
INADA Naoki3ae20562017-01-16 20:41:20 +090012169/*[clinic input]
12170str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012171
INADA Naoki3ae20562017-01-16 20:41:20 +090012172Return True if the string is printable, False otherwise.
12173
12174A string is printable if all of its characters are considered printable in
12175repr() or if it is empty.
12176[clinic start generated code]*/
12177
12178static PyObject *
12179unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012180/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 Py_ssize_t i, length;
12183 int kind;
12184 void *data;
12185
12186 if (PyUnicode_READY(self) == -1)
12187 return NULL;
12188 length = PyUnicode_GET_LENGTH(self);
12189 kind = PyUnicode_KIND(self);
12190 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012191
12192 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 if (length == 1)
12194 return PyBool_FromLong(
12195 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 for (i = 0; i < length; i++) {
12198 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012199 Py_RETURN_FALSE;
12200 }
12201 }
12202 Py_RETURN_TRUE;
12203}
12204
INADA Naoki3ae20562017-01-16 20:41:20 +090012205/*[clinic input]
12206str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207
INADA Naoki3ae20562017-01-16 20:41:20 +090012208 iterable: object
12209 /
12210
12211Concatenate any number of strings.
12212
Martin Panter91a88662017-01-24 00:30:06 +000012213The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012214The result is returned as a new string.
12215
12216Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12217[clinic start generated code]*/
12218
12219static PyObject *
12220unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012221/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222{
INADA Naoki3ae20562017-01-16 20:41:20 +090012223 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224}
12225
Martin v. Löwis18e16552006-02-15 17:27:45 +000012226static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012227unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (PyUnicode_READY(self) == -1)
12230 return -1;
12231 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232}
12233
INADA Naoki3ae20562017-01-16 20:41:20 +090012234/*[clinic input]
12235str.ljust as unicode_ljust
12236
12237 width: Py_ssize_t
12238 fillchar: Py_UCS4 = ' '
12239 /
12240
12241Return a left-justified string of length width.
12242
12243Padding is done using the specified fill character (default is a space).
12244[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245
12246static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012247unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12248/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012250 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252
Victor Stinnerc4b49542011-12-11 22:44:26 +010012253 if (PyUnicode_GET_LENGTH(self) >= width)
12254 return unicode_result_unchanged(self);
12255
12256 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257}
12258
INADA Naoki3ae20562017-01-16 20:41:20 +090012259/*[clinic input]
12260str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261
INADA Naoki3ae20562017-01-16 20:41:20 +090012262Return a copy of the string converted to lowercase.
12263[clinic start generated code]*/
12264
12265static PyObject *
12266unicode_lower_impl(PyObject *self)
12267/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012269 if (PyUnicode_READY(self) == -1)
12270 return NULL;
12271 if (PyUnicode_IS_ASCII(self))
12272 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012273 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274}
12275
/* Selectors for the strip helpers: which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the characters and
   the table's pointers are const: the table is read-only data and is never
   meant to be modified. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the Python-level method for strip selector i (used in error
   messages). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012284
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012285/* externally visible for str.strip(unicode) */
12286PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012287_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012288{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289 void *data;
12290 int kind;
12291 Py_ssize_t i, j, len;
12292 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012293 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12296 return NULL;
12297
12298 kind = PyUnicode_KIND(self);
12299 data = PyUnicode_DATA(self);
12300 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012301 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12303 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012304 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012305
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 i = 0;
12307 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012308 while (i < len) {
12309 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12310 if (!BLOOM(sepmask, ch))
12311 break;
12312 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12313 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 i++;
12315 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012316 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012317
Benjamin Peterson14339b62009-01-31 16:36:08 +000012318 j = len;
12319 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012320 j--;
12321 while (j >= i) {
12322 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12323 if (!BLOOM(sepmask, ch))
12324 break;
12325 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12326 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012328 }
12329
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012331 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012332
Victor Stinner7931d9a2011-11-04 00:22:48 +010012333 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334}
12335
12336PyObject*
12337PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12338{
12339 unsigned char *data;
12340 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012341 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342
Victor Stinnerde636f32011-10-01 03:55:54 +020012343 if (PyUnicode_READY(self) == -1)
12344 return NULL;
12345
Victor Stinner684d5fd2012-05-03 02:32:34 +020012346 length = PyUnicode_GET_LENGTH(self);
12347 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012348
Victor Stinner684d5fd2012-05-03 02:32:34 +020012349 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012350 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351
Victor Stinnerde636f32011-10-01 03:55:54 +020012352 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012353 PyErr_SetString(PyExc_IndexError, "string index out of range");
12354 return NULL;
12355 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012356 if (start >= length || end < start)
12357 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012358
Victor Stinner684d5fd2012-05-03 02:32:34 +020012359 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012360 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012361 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012362 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012363 }
12364 else {
12365 kind = PyUnicode_KIND(self);
12366 data = PyUnicode_1BYTE_DATA(self);
12367 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012368 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012369 length);
12370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372
12373static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012374do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 Py_ssize_t len, i, j;
12377
12378 if (PyUnicode_READY(self) == -1)
12379 return NULL;
12380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012382
Victor Stinnercc7af722013-04-09 22:39:24 +020012383 if (PyUnicode_IS_ASCII(self)) {
12384 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12385
12386 i = 0;
12387 if (striptype != RIGHTSTRIP) {
12388 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012389 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012390 if (!_Py_ascii_whitespace[ch])
12391 break;
12392 i++;
12393 }
12394 }
12395
12396 j = len;
12397 if (striptype != LEFTSTRIP) {
12398 j--;
12399 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012400 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012401 if (!_Py_ascii_whitespace[ch])
12402 break;
12403 j--;
12404 }
12405 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406 }
12407 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012408 else {
12409 int kind = PyUnicode_KIND(self);
12410 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012411
Victor Stinnercc7af722013-04-09 22:39:24 +020012412 i = 0;
12413 if (striptype != RIGHTSTRIP) {
12414 while (i < len) {
12415 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12416 if (!Py_UNICODE_ISSPACE(ch))
12417 break;
12418 i++;
12419 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012420 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012421
12422 j = len;
12423 if (striptype != LEFTSTRIP) {
12424 j--;
12425 while (j >= i) {
12426 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12427 if (!Py_UNICODE_ISSPACE(ch))
12428 break;
12429 j--;
12430 }
12431 j++;
12432 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012433 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012434
Victor Stinner7931d9a2011-11-04 00:22:48 +010012435 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436}
12437
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012438
12439static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012440do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012441{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 if (sep != NULL && sep != Py_None) {
12443 if (PyUnicode_Check(sep))
12444 return _PyUnicode_XStrip(self, striptype, sep);
12445 else {
12446 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 "%s arg must be None or str",
12448 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012449 return NULL;
12450 }
12451 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012452
Benjamin Peterson14339b62009-01-31 16:36:08 +000012453 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012454}
12455
12456
INADA Naoki3ae20562017-01-16 20:41:20 +090012457/*[clinic input]
12458str.strip as unicode_strip
12459
12460 chars: object = None
12461 /
12462
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012464
12465If chars is given and not None, remove characters in chars instead.
12466[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012467
12468static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012469unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012470/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012471{
INADA Naoki3ae20562017-01-16 20:41:20 +090012472 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012473}
12474
12475
INADA Naoki3ae20562017-01-16 20:41:20 +090012476/*[clinic input]
12477str.lstrip as unicode_lstrip
12478
12479 chars: object = NULL
12480 /
12481
12482Return a copy of the string with leading whitespace removed.
12483
12484If chars is given and not None, remove characters in chars instead.
12485[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012486
12487static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012488unicode_lstrip_impl(PyObject *self, PyObject *chars)
12489/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012490{
INADA Naoki3ae20562017-01-16 20:41:20 +090012491 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012492}
12493
12494
INADA Naoki3ae20562017-01-16 20:41:20 +090012495/*[clinic input]
12496str.rstrip as unicode_rstrip
12497
12498 chars: object = NULL
12499 /
12500
12501Return a copy of the string with trailing whitespace removed.
12502
12503If chars is given and not None, remove characters in chars instead.
12504[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012505
12506static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012507unicode_rstrip_impl(PyObject *self, PyObject *chars)
12508/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012509{
INADA Naoki3ae20562017-01-16 20:41:20 +090012510 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012511}
12512
12513
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012515unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012517 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
Serhiy Storchaka05997252013-01-26 12:14:02 +020012520 if (len < 1)
12521 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
Victor Stinnerc4b49542011-12-11 22:44:26 +010012523 /* no repeat, return original string */
12524 if (len == 1)
12525 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012526
Benjamin Petersonbac79492012-01-14 13:34:47 -050012527 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 return NULL;
12529
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012530 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012531 PyErr_SetString(PyExc_OverflowError,
12532 "repeated string is too long");
12533 return NULL;
12534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012536
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012537 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538 if (!u)
12539 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012540 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 if (PyUnicode_GET_LENGTH(str) == 1) {
12543 const int kind = PyUnicode_KIND(str);
12544 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012545 if (kind == PyUnicode_1BYTE_KIND) {
12546 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012547 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012548 }
12549 else if (kind == PyUnicode_2BYTE_KIND) {
12550 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012551 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012552 ucs2[n] = fill_char;
12553 } else {
12554 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12555 assert(kind == PyUnicode_4BYTE_KIND);
12556 for (n = 0; n < len; ++n)
12557 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 }
12560 else {
12561 /* number of characters copied this far */
12562 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012563 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012565 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012567 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012569 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012570 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572 }
12573
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012574 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012575 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576}
12577
Alexander Belopolsky40018472011-02-26 01:02:56 +000012578PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012579PyUnicode_Replace(PyObject *str,
12580 PyObject *substr,
12581 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012582 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012584 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12585 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012586 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012587 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588}
12589
INADA Naoki3ae20562017-01-16 20:41:20 +090012590/*[clinic input]
12591str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592
INADA Naoki3ae20562017-01-16 20:41:20 +090012593 old: unicode
12594 new: unicode
12595 count: Py_ssize_t = -1
12596 Maximum number of occurrences to replace.
12597 -1 (the default value) means replace all occurrences.
12598 /
12599
12600Return a copy with all occurrences of substring old replaced by new.
12601
12602If the optional argument count is given, only the first count occurrences are
12603replaced.
12604[clinic start generated code]*/
12605
12606static PyObject *
12607unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12608 Py_ssize_t count)
12609/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012611 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012613 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614}
12615
Alexander Belopolsky40018472011-02-26 01:02:56 +000012616static PyObject *
12617unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012619 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 Py_ssize_t isize;
12621 Py_ssize_t osize, squote, dquote, i, o;
12622 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012623 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012627 return NULL;
12628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 isize = PyUnicode_GET_LENGTH(unicode);
12630 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 /* Compute length of output, quote characters, and
12633 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012634 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 max = 127;
12636 squote = dquote = 0;
12637 ikind = PyUnicode_KIND(unicode);
12638 for (i = 0; i < isize; i++) {
12639 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012640 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012642 case '\'': squote++; break;
12643 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012645 incr = 2;
12646 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 default:
12648 /* Fast-path ASCII */
12649 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012650 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012652 ;
12653 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012656 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012658 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012660 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012662 if (osize > PY_SSIZE_T_MAX - incr) {
12663 PyErr_SetString(PyExc_OverflowError,
12664 "string is too long to generate repr");
12665 return NULL;
12666 }
12667 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 }
12669
12670 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012671 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012673 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 if (dquote)
12675 /* Both squote and dquote present. Use squote,
12676 and escape them */
12677 osize += squote;
12678 else
12679 quote = '"';
12680 }
Victor Stinner55c08782013-04-14 18:45:39 +020012681 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682
12683 repr = PyUnicode_New(osize, max);
12684 if (repr == NULL)
12685 return NULL;
12686 okind = PyUnicode_KIND(repr);
12687 odata = PyUnicode_DATA(repr);
12688
12689 PyUnicode_WRITE(okind, odata, 0, quote);
12690 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012691 if (unchanged) {
12692 _PyUnicode_FastCopyCharacters(repr, 1,
12693 unicode, 0,
12694 isize);
12695 }
12696 else {
12697 for (i = 0, o = 1; i < isize; i++) {
12698 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699
Victor Stinner55c08782013-04-14 18:45:39 +020012700 /* Escape quotes and backslashes */
12701 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012702 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012704 continue;
12705 }
12706
12707 /* Map special whitespace to '\t', \n', '\r' */
12708 if (ch == '\t') {
12709 PyUnicode_WRITE(okind, odata, o++, '\\');
12710 PyUnicode_WRITE(okind, odata, o++, 't');
12711 }
12712 else if (ch == '\n') {
12713 PyUnicode_WRITE(okind, odata, o++, '\\');
12714 PyUnicode_WRITE(okind, odata, o++, 'n');
12715 }
12716 else if (ch == '\r') {
12717 PyUnicode_WRITE(okind, odata, o++, '\\');
12718 PyUnicode_WRITE(okind, odata, o++, 'r');
12719 }
12720
12721 /* Map non-printable US ASCII to '\xhh' */
12722 else if (ch < ' ' || ch == 0x7F) {
12723 PyUnicode_WRITE(okind, odata, o++, '\\');
12724 PyUnicode_WRITE(okind, odata, o++, 'x');
12725 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12726 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12727 }
12728
12729 /* Copy ASCII characters as-is */
12730 else if (ch < 0x7F) {
12731 PyUnicode_WRITE(okind, odata, o++, ch);
12732 }
12733
12734 /* Non-ASCII characters */
12735 else {
12736 /* Map Unicode whitespace and control characters
12737 (categories Z* and C* except ASCII space)
12738 */
12739 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12740 PyUnicode_WRITE(okind, odata, o++, '\\');
12741 /* Map 8-bit characters to '\xhh' */
12742 if (ch <= 0xff) {
12743 PyUnicode_WRITE(okind, odata, o++, 'x');
12744 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12745 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12746 }
12747 /* Map 16-bit characters to '\uxxxx' */
12748 else if (ch <= 0xffff) {
12749 PyUnicode_WRITE(okind, odata, o++, 'u');
12750 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12754 }
12755 /* Map 21-bit characters to '\U00xxxxxx' */
12756 else {
12757 PyUnicode_WRITE(okind, odata, o++, 'U');
12758 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12759 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12762 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12763 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12764 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12765 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12766 }
12767 }
12768 /* Copy characters as-is */
12769 else {
12770 PyUnicode_WRITE(okind, odata, o++, ch);
12771 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012772 }
12773 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012776 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012777 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778}
12779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012780PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012781 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782\n\
12783Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012784such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785arguments start and end are interpreted as in slice notation.\n\
12786\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012787Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788
12789static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012792 /* initialize variables to prevent gcc warning */
12793 PyObject *substring = NULL;
12794 Py_ssize_t start = 0;
12795 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012798 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012801 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012804 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806 if (result == -2)
12807 return NULL;
12808
Christian Heimes217cfd12007-12-02 14:31:20 +000012809 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810}
12811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012812PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012813 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012815Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
12817static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012820 /* initialize variables to prevent gcc warning */
12821 PyObject *substring = NULL;
12822 Py_ssize_t start = 0;
12823 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012826 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012829 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012832 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 if (result == -2)
12835 return NULL;
12836
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837 if (result < 0) {
12838 PyErr_SetString(PyExc_ValueError, "substring not found");
12839 return NULL;
12840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841
Christian Heimes217cfd12007-12-02 14:31:20 +000012842 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843}
12844
INADA Naoki3ae20562017-01-16 20:41:20 +090012845/*[clinic input]
12846str.rjust as unicode_rjust
12847
12848 width: Py_ssize_t
12849 fillchar: Py_UCS4 = ' '
12850 /
12851
12852Return a right-justified string of length width.
12853
12854Padding is done using the specified fill character (default is a space).
12855[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856
12857static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012858unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12859/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012861 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862 return NULL;
12863
Victor Stinnerc4b49542011-12-11 22:44:26 +010012864 if (PyUnicode_GET_LENGTH(self) >= width)
12865 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866
Victor Stinnerc4b49542011-12-11 22:44:26 +010012867 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868}
12869
Alexander Belopolsky40018472011-02-26 01:02:56 +000012870PyObject *
12871PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012873 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012876 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877}
12878
INADA Naoki3ae20562017-01-16 20:41:20 +090012879/*[clinic input]
12880str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881
INADA Naoki3ae20562017-01-16 20:41:20 +090012882 sep: object = None
        The delimiter according to which to split the string.
12884 None (the default value) means split according to any whitespace,
12885 and discard empty strings from the result.
12886 maxsplit: Py_ssize_t = -1
12887 Maximum number of splits to do.
12888 -1 (the default value) means no limit.
12889
12890Return a list of the words in the string, using sep as the delimiter string.
12891[clinic start generated code]*/
12892
12893static PyObject *
12894unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12895/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896{
INADA Naoki3ae20562017-01-16 20:41:20 +090012897 if (sep == Py_None)
12898 return split(self, NULL, maxsplit);
12899 if (PyUnicode_Check(sep))
12900 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012901
12902 PyErr_Format(PyExc_TypeError,
12903 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012904 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012905 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906}
12907
Thomas Wouters477c8d52006-05-27 19:21:47 +000012908PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012909PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012910{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012911 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012912 int kind1, kind2;
12913 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012915
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012916 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012917 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012918
Victor Stinner14f8f022011-10-05 20:58:25 +020012919 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 len1 = PyUnicode_GET_LENGTH(str_obj);
12922 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012923 if (kind1 < kind2 || len1 < len2) {
12924 _Py_INCREF_UNICODE_EMPTY();
12925 if (!unicode_empty)
12926 out = NULL;
12927 else {
12928 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12929 Py_DECREF(unicode_empty);
12930 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012931 return out;
12932 }
12933 buf1 = PyUnicode_DATA(str_obj);
12934 buf2 = PyUnicode_DATA(sep_obj);
12935 if (kind2 != kind1) {
12936 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12937 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012938 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012941 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012943 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12944 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12945 else
12946 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 break;
12948 case PyUnicode_2BYTE_KIND:
12949 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12950 break;
12951 case PyUnicode_4BYTE_KIND:
12952 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12953 break;
12954 default:
12955 assert(0);
12956 out = 0;
12957 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012958
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012959 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012961
12962 return out;
12963}
12964
12965
12966PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012967PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012968{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012969 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012970 int kind1, kind2;
12971 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012973
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012974 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012975 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012976
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012977 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 len1 = PyUnicode_GET_LENGTH(str_obj);
12980 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012981 if (kind1 < kind2 || len1 < len2) {
12982 _Py_INCREF_UNICODE_EMPTY();
12983 if (!unicode_empty)
12984 out = NULL;
12985 else {
12986 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12987 Py_DECREF(unicode_empty);
12988 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012989 return out;
12990 }
12991 buf1 = PyUnicode_DATA(str_obj);
12992 buf2 = PyUnicode_DATA(sep_obj);
12993 if (kind2 != kind1) {
12994 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12995 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012996 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012999 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013001 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13002 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13003 else
13004 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 break;
13006 case PyUnicode_2BYTE_KIND:
13007 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13008 break;
13009 case PyUnicode_4BYTE_KIND:
13010 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13011 break;
13012 default:
13013 assert(0);
13014 out = 0;
13015 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013016
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013017 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013019
13020 return out;
13021}
13022
INADA Naoki3ae20562017-01-16 20:41:20 +090013023/*[clinic input]
13024str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013025
INADA Naoki3ae20562017-01-16 20:41:20 +090013026 sep: object
13027 /
13028
13029Partition the string into three parts using the given separator.
13030
13031This will search for the separator in the string. If the separator is found,
13032returns a 3-tuple containing the part before the separator, the separator
13033itself, and the part after it.
13034
13035If the separator is not found, returns a 3-tuple containing the original string
13036and two empty strings.
13037[clinic start generated code]*/
13038
13039static PyObject *
13040unicode_partition(PyObject *self, PyObject *sep)
13041/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013042{
INADA Naoki3ae20562017-01-16 20:41:20 +090013043 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013044}
13045
INADA Naoki3ae20562017-01-16 20:41:20 +090013046/*[clinic input]
13047str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013048
INADA Naoki3ae20562017-01-16 20:41:20 +090013049Partition the string into three parts using the given separator.
13050
This will search for the separator in the string, starting at the end. If
13052the separator is found, returns a 3-tuple containing the part before the
13053separator, the separator itself, and the part after it.
13054
13055If the separator is not found, returns a 3-tuple containing two empty strings
13056and the original string.
13057[clinic start generated code]*/
13058
13059static PyObject *
13060unicode_rpartition(PyObject *self, PyObject *sep)
13061/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013062{
INADA Naoki3ae20562017-01-16 20:41:20 +090013063 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013064}
13065
Alexander Belopolsky40018472011-02-26 01:02:56 +000013066PyObject *
13067PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013068{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013069 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013070 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013071
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013072 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013073}
13074
INADA Naoki3ae20562017-01-16 20:41:20 +090013075/*[clinic input]
13076str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013077
INADA Naoki3ae20562017-01-16 20:41:20 +090013078Return a list of the words in the string, using sep as the delimiter string.
13079
13080Splits are done starting at the end of the string and working to the front.
13081[clinic start generated code]*/
13082
13083static PyObject *
13084unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13085/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013086{
INADA Naoki3ae20562017-01-16 20:41:20 +090013087 if (sep == Py_None)
13088 return rsplit(self, NULL, maxsplit);
13089 if (PyUnicode_Check(sep))
13090 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013091
13092 PyErr_Format(PyExc_TypeError,
13093 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013094 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013095 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013096}
13097
INADA Naoki3ae20562017-01-16 20:41:20 +090013098/*[clinic input]
13099str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013101 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013102
13103Return a list of the lines in the string, breaking at line boundaries.
13104
13105Line breaks are not included in the resulting list unless keepends is given and
13106true.
13107[clinic start generated code]*/
13108
13109static PyObject *
13110unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013111/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013113 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114}
13115
13116static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013117PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013119 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120}
13121
INADA Naoki3ae20562017-01-16 20:41:20 +090013122/*[clinic input]
13123str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124
INADA Naoki3ae20562017-01-16 20:41:20 +090013125Convert uppercase characters to lowercase and lowercase characters to uppercase.
13126[clinic start generated code]*/
13127
13128static PyObject *
13129unicode_swapcase_impl(PyObject *self)
13130/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013132 if (PyUnicode_READY(self) == -1)
13133 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013134 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135}
13136
Larry Hastings61272b72014-01-07 12:41:53 -080013137/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013138
Larry Hastings31826802013-10-19 00:09:25 -070013139@staticmethod
13140str.maketrans as unicode_maketrans
13141
13142 x: object
13143
13144 y: unicode=NULL
13145
13146 z: unicode=NULL
13147
13148 /
13149
13150Return a translation table usable for str.translate().
13151
13152If there is only one argument, it must be a dictionary mapping Unicode
13153ordinals (integers) or characters to Unicode ordinals, strings or None.
13154Character keys will be then converted to ordinals.
13155If there are two arguments, they must be strings of equal length, and
13156in the resulting dictionary, each character in x will be mapped to the
13157character at the same position in y. If there is a third argument, it
13158must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013159[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013160
Larry Hastings31826802013-10-19 00:09:25 -070013161static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013162unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013163/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013164{
Georg Brandlceee0772007-11-27 23:48:05 +000013165 PyObject *new = NULL, *key, *value;
13166 Py_ssize_t i = 0;
13167 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013168
Georg Brandlceee0772007-11-27 23:48:05 +000013169 new = PyDict_New();
13170 if (!new)
13171 return NULL;
13172 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 int x_kind, y_kind, z_kind;
13174 void *x_data, *y_data, *z_data;
13175
Georg Brandlceee0772007-11-27 23:48:05 +000013176 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013177 if (!PyUnicode_Check(x)) {
13178 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13179 "be a string if there is a second argument");
13180 goto err;
13181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013182 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013183 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13184 "arguments must have equal length");
13185 goto err;
13186 }
13187 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013188 x_kind = PyUnicode_KIND(x);
13189 y_kind = PyUnicode_KIND(y);
13190 x_data = PyUnicode_DATA(x);
13191 y_data = PyUnicode_DATA(y);
13192 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13193 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013194 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013195 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013196 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013197 if (!value) {
13198 Py_DECREF(key);
13199 goto err;
13200 }
Georg Brandlceee0772007-11-27 23:48:05 +000013201 res = PyDict_SetItem(new, key, value);
13202 Py_DECREF(key);
13203 Py_DECREF(value);
13204 if (res < 0)
13205 goto err;
13206 }
13207 /* create entries for deleting chars in z */
13208 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013209 z_kind = PyUnicode_KIND(z);
13210 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013211 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013213 if (!key)
13214 goto err;
13215 res = PyDict_SetItem(new, key, Py_None);
13216 Py_DECREF(key);
13217 if (res < 0)
13218 goto err;
13219 }
13220 }
13221 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013222 int kind;
13223 void *data;
13224
Georg Brandlceee0772007-11-27 23:48:05 +000013225 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013226 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013227 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13228 "to maketrans it must be a dict");
13229 goto err;
13230 }
13231 /* copy entries into the new dict, converting string keys to int keys */
13232 while (PyDict_Next(x, &i, &key, &value)) {
13233 if (PyUnicode_Check(key)) {
13234 /* convert string keys to integer keys */
13235 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013236 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013237 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13238 "table must be of length 1");
13239 goto err;
13240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013241 kind = PyUnicode_KIND(key);
13242 data = PyUnicode_DATA(key);
13243 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013244 if (!newkey)
13245 goto err;
13246 res = PyDict_SetItem(new, newkey, value);
13247 Py_DECREF(newkey);
13248 if (res < 0)
13249 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013250 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013251 /* just keep integer keys */
13252 if (PyDict_SetItem(new, key, value) < 0)
13253 goto err;
13254 } else {
13255 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13256 "be strings or integers");
13257 goto err;
13258 }
13259 }
13260 }
13261 return new;
13262 err:
13263 Py_DECREF(new);
13264 return NULL;
13265}
13266
INADA Naoki3ae20562017-01-16 20:41:20 +090013267/*[clinic input]
13268str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269
INADA Naoki3ae20562017-01-16 20:41:20 +090013270 table: object
13271 Translation table, which must be a mapping of Unicode ordinals to
13272 Unicode ordinals, strings, or None.
13273 /
13274
13275Replace each character in the string using the given translation table.
13276
13277The table must implement lookup/indexing via __getitem__, for instance a
13278dictionary or list. If this operation raises LookupError, the character is
13279left untouched. Characters mapped to None are deleted.
13280[clinic start generated code]*/
13281
13282static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013283unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013284/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287}
13288
INADA Naoki3ae20562017-01-16 20:41:20 +090013289/*[clinic input]
13290str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291
INADA Naoki3ae20562017-01-16 20:41:20 +090013292Return a copy of the string converted to uppercase.
13293[clinic start generated code]*/
13294
13295static PyObject *
13296unicode_upper_impl(PyObject *self)
13297/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013299 if (PyUnicode_READY(self) == -1)
13300 return NULL;
13301 if (PyUnicode_IS_ASCII(self))
13302 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013303 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304}
13305
INADA Naoki3ae20562017-01-16 20:41:20 +090013306/*[clinic input]
13307str.zfill as unicode_zfill
13308
13309 width: Py_ssize_t
13310 /
13311
13312Pad a numeric string with zeros on the left, to fill a field of the given width.
13313
13314The string is never truncated.
13315[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316
13317static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013318unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013319/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013320{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013321 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013322 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013323 int kind;
13324 void *data;
13325 Py_UCS4 chr;
13326
Benjamin Petersonbac79492012-01-14 13:34:47 -050013327 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013329
Victor Stinnerc4b49542011-12-11 22:44:26 +010013330 if (PyUnicode_GET_LENGTH(self) >= width)
13331 return unicode_result_unchanged(self);
13332
13333 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013334
13335 u = pad(self, fill, 0, '0');
13336
Walter Dörwald068325e2002-04-15 13:36:47 +000013337 if (u == NULL)
13338 return NULL;
13339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013340 kind = PyUnicode_KIND(u);
13341 data = PyUnicode_DATA(u);
13342 chr = PyUnicode_READ(kind, data, fill);
13343
13344 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 PyUnicode_WRITE(kind, data, 0, chr);
13347 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013348 }
13349
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013350 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013351 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353
#if 0
/* Compiled out.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013362PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013363 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013365Return True if S starts with the specified prefix, False otherwise.\n\
13366With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013367With optional end, stop comparing S at that position.\n\
13368prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013369
13370static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013371unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013374 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013375 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013376 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013377 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013378 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013379
Jesus Ceaac451502011-04-20 17:09:23 +020013380 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013381 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 if (PyTuple_Check(subobj)) {
13383 Py_ssize_t i;
13384 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013385 substring = PyTuple_GET_ITEM(subobj, i);
13386 if (!PyUnicode_Check(substring)) {
13387 PyErr_Format(PyExc_TypeError,
13388 "tuple for startswith must only contain str, "
13389 "not %.100s",
13390 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013391 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013392 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013393 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013394 if (result == -1)
13395 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013396 if (result) {
13397 Py_RETURN_TRUE;
13398 }
13399 }
13400 /* nothing matched */
13401 Py_RETURN_FALSE;
13402 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 if (!PyUnicode_Check(subobj)) {
13404 PyErr_Format(PyExc_TypeError,
13405 "startswith first arg must be str or "
13406 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013408 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013409 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013410 if (result == -1)
13411 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013412 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413}
13414
13415
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013416PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013418\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013419Return True if S ends with the specified suffix, False otherwise.\n\
13420With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013421With optional end, stop comparing S at that position.\n\
13422suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423
13424static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013425unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013428 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013429 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013430 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013431 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013432 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433
Jesus Ceaac451502011-04-20 17:09:23 +020013434 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013435 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013436 if (PyTuple_Check(subobj)) {
13437 Py_ssize_t i;
13438 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013439 substring = PyTuple_GET_ITEM(subobj, i);
13440 if (!PyUnicode_Check(substring)) {
13441 PyErr_Format(PyExc_TypeError,
13442 "tuple for endswith must only contain str, "
13443 "not %.100s",
13444 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013446 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013447 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013448 if (result == -1)
13449 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013450 if (result) {
13451 Py_RETURN_TRUE;
13452 }
13453 }
13454 Py_RETURN_FALSE;
13455 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013456 if (!PyUnicode_Check(subobj)) {
13457 PyErr_Format(PyExc_TypeError,
13458 "endswith first arg must be str or "
13459 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013461 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013462 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013463 if (result == -1)
13464 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013465 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466}
13467
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013468static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013469_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013470{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013471 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13472 writer->data = PyUnicode_DATA(writer->buffer);
13473
13474 if (!writer->readonly) {
13475 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013476 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013477 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013478 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013479 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13480 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13481 writer->kind = PyUnicode_WCHAR_KIND;
13482 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13483
Victor Stinner8f674cc2013-04-17 23:02:17 +020013484 /* Copy-on-write mode: set buffer size to 0 so
13485 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13486 * next write. */
13487 writer->size = 0;
13488 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013489}
13490
Victor Stinnerd3f08822012-05-29 12:57:52 +020013491void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013492_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013493{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013494 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013495
13496 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013497 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013498
13499 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13500 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13501 writer->kind = PyUnicode_WCHAR_KIND;
13502 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013503}
13504
Victor Stinnerd3f08822012-05-29 12:57:52 +020013505int
13506_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13507 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013508{
13509 Py_ssize_t newlen;
13510 PyObject *newbuffer;
13511
Victor Stinner2740e462016-09-06 16:58:36 -070013512 assert(maxchar <= MAX_UNICODE);
13513
Victor Stinnerca9381e2015-09-22 00:58:32 +020013514 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013515 assert((maxchar > writer->maxchar && length >= 0)
13516 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013517
Victor Stinner202fdca2012-05-07 12:47:02 +020013518 if (length > PY_SSIZE_T_MAX - writer->pos) {
13519 PyErr_NoMemory();
13520 return -1;
13521 }
13522 newlen = writer->pos + length;
13523
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013524 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013525
Victor Stinnerd3f08822012-05-29 12:57:52 +020013526 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013527 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013528 if (writer->overallocate
13529 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13530 /* overallocate to limit the number of realloc() */
13531 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013532 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013533 if (newlen < writer->min_length)
13534 newlen = writer->min_length;
13535
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536 writer->buffer = PyUnicode_New(newlen, maxchar);
13537 if (writer->buffer == NULL)
13538 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013539 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013540 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013541 if (writer->overallocate
13542 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13543 /* overallocate to limit the number of realloc() */
13544 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013545 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013546 if (newlen < writer->min_length)
13547 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013548
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013549 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013550 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013551 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013552 newbuffer = PyUnicode_New(newlen, maxchar);
13553 if (newbuffer == NULL)
13554 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013555 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13556 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013557 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013558 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013559 }
13560 else {
13561 newbuffer = resize_compact(writer->buffer, newlen);
13562 if (newbuffer == NULL)
13563 return -1;
13564 }
13565 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013566 }
13567 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013568 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013569 newbuffer = PyUnicode_New(writer->size, maxchar);
13570 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013571 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013572 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13573 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013574 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013575 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013576 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013577 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013578
13579#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013580}
13581
Victor Stinnerca9381e2015-09-22 00:58:32 +020013582int
13583_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13584 enum PyUnicode_Kind kind)
13585{
13586 Py_UCS4 maxchar;
13587
13588 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13589 assert(writer->kind < kind);
13590
13591 switch (kind)
13592 {
13593 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13594 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13595 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13596 default:
13597 assert(0 && "invalid kind");
13598 return -1;
13599 }
13600
13601 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13602}
13603
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013604static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013605_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013606{
Victor Stinner2740e462016-09-06 16:58:36 -070013607 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013608 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13609 return -1;
13610 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13611 writer->pos++;
13612 return 0;
13613}
13614
13615int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013616_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13617{
13618 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13619}
13620
13621int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013622_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13623{
13624 Py_UCS4 maxchar;
13625 Py_ssize_t len;
13626
13627 if (PyUnicode_READY(str) == -1)
13628 return -1;
13629 len = PyUnicode_GET_LENGTH(str);
13630 if (len == 0)
13631 return 0;
13632 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13633 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013634 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013635 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013636 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013637 Py_INCREF(str);
13638 writer->buffer = str;
13639 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013640 writer->pos += len;
13641 return 0;
13642 }
13643 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13644 return -1;
13645 }
13646 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13647 str, 0, len);
13648 writer->pos += len;
13649 return 0;
13650}
13651
Victor Stinnere215d962012-10-06 23:03:36 +020013652int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013653_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13654 Py_ssize_t start, Py_ssize_t end)
13655{
13656 Py_UCS4 maxchar;
13657 Py_ssize_t len;
13658
13659 if (PyUnicode_READY(str) == -1)
13660 return -1;
13661
13662 assert(0 <= start);
13663 assert(end <= PyUnicode_GET_LENGTH(str));
13664 assert(start <= end);
13665
13666 if (end == 0)
13667 return 0;
13668
13669 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13670 return _PyUnicodeWriter_WriteStr(writer, str);
13671
13672 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13673 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13674 else
13675 maxchar = writer->maxchar;
13676 len = end - start;
13677
13678 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13679 return -1;
13680
13681 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13682 str, start, len);
13683 writer->pos += len;
13684 return 0;
13685}
13686
13687int
Victor Stinner4a587072013-11-19 12:54:53 +010013688_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13689 const char *ascii, Py_ssize_t len)
13690{
13691 if (len == -1)
13692 len = strlen(ascii);
13693
13694 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13695
13696 if (writer->buffer == NULL && !writer->overallocate) {
13697 PyObject *str;
13698
13699 str = _PyUnicode_FromASCII(ascii, len);
13700 if (str == NULL)
13701 return -1;
13702
13703 writer->readonly = 1;
13704 writer->buffer = str;
13705 _PyUnicodeWriter_Update(writer);
13706 writer->pos += len;
13707 return 0;
13708 }
13709
13710 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13711 return -1;
13712
13713 switch (writer->kind)
13714 {
13715 case PyUnicode_1BYTE_KIND:
13716 {
13717 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13718 Py_UCS1 *data = writer->data;
13719
Christian Heimesf051e432016-09-13 20:22:02 +020013720 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013721 break;
13722 }
13723 case PyUnicode_2BYTE_KIND:
13724 {
13725 _PyUnicode_CONVERT_BYTES(
13726 Py_UCS1, Py_UCS2,
13727 ascii, ascii + len,
13728 (Py_UCS2 *)writer->data + writer->pos);
13729 break;
13730 }
13731 case PyUnicode_4BYTE_KIND:
13732 {
13733 _PyUnicode_CONVERT_BYTES(
13734 Py_UCS1, Py_UCS4,
13735 ascii, ascii + len,
13736 (Py_UCS4 *)writer->data + writer->pos);
13737 break;
13738 }
13739 default:
13740 assert(0);
13741 }
13742
13743 writer->pos += len;
13744 return 0;
13745}
13746
13747int
13748_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13749 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013750{
13751 Py_UCS4 maxchar;
13752
13753 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13754 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13755 return -1;
13756 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13757 writer->pos += len;
13758 return 0;
13759}
13760
Victor Stinnerd3f08822012-05-29 12:57:52 +020013761PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013762_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013763{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013764 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013765
Victor Stinnerd3f08822012-05-29 12:57:52 +020013766 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013767 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013768 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013769 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013770
13771 str = writer->buffer;
13772 writer->buffer = NULL;
13773
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013774 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013775 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13776 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013777 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013778
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013779 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13780 PyObject *str2;
13781 str2 = resize_compact(str, writer->pos);
13782 if (str2 == NULL) {
13783 Py_DECREF(str);
13784 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013785 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013786 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013787 }
13788
Victor Stinner15a0bd32013-07-08 22:29:55 +020013789 assert(_PyUnicode_CheckConsistency(str, 1));
13790 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013791}
13792
Victor Stinnerd3f08822012-05-29 12:57:52 +020013793void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013794_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013795{
13796 Py_CLEAR(writer->buffer);
13797}
13798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013800
13801PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013802 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013803\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013804Return a formatted version of S, using substitutions from args and kwargs.\n\
13805The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013806
Eric Smith27bbca62010-11-04 17:06:58 +000013807PyDoc_STRVAR(format_map__doc__,
13808 "S.format_map(mapping) -> str\n\
13809\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013810Return a formatted version of S, using substitutions from mapping.\n\
13811The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013812
INADA Naoki3ae20562017-01-16 20:41:20 +090013813/*[clinic input]
13814str.__format__ as unicode___format__
13815
13816 format_spec: unicode
13817 /
13818
13819Return a formatted version of the string as described by format_spec.
13820[clinic start generated code]*/
13821
Eric Smith4a7d76d2008-05-30 18:10:19 +000013822static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013823unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013824/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013825{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013826 _PyUnicodeWriter writer;
13827 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013828
Victor Stinnerd3f08822012-05-29 12:57:52 +020013829 if (PyUnicode_READY(self) == -1)
13830 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013831 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013832 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13833 self, format_spec, 0,
13834 PyUnicode_GET_LENGTH(format_spec));
13835 if (ret == -1) {
13836 _PyUnicodeWriter_Dealloc(&writer);
13837 return NULL;
13838 }
13839 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013840}
13841
INADA Naoki3ae20562017-01-16 20:41:20 +090013842/*[clinic input]
13843str.__sizeof__ as unicode_sizeof
13844
13845Return the size of the string in memory, in bytes.
13846[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013847
13848static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013849unicode_sizeof_impl(PyObject *self)
13850/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013851{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013852 Py_ssize_t size;
13853
13854 /* If it's a compact object, account for base structure +
13855 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013856 if (PyUnicode_IS_COMPACT_ASCII(self))
13857 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13858 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013859 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013860 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013861 else {
13862 /* If it is a two-block object, account for base object, and
13863 for character block if present. */
13864 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013865 if (_PyUnicode_DATA_ANY(self))
13866 size += (PyUnicode_GET_LENGTH(self) + 1) *
13867 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013868 }
13869 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013870 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013871 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13872 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13873 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13874 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013875
13876 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013877}
13878
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013879static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013880unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013881{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013882 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013883 if (!copy)
13884 return NULL;
13885 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013886}
13887
Guido van Rossumd57fd912000-03-10 22:53:23 +000013888static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013889 UNICODE_ENCODE_METHODDEF
13890 UNICODE_REPLACE_METHODDEF
13891 UNICODE_SPLIT_METHODDEF
13892 UNICODE_RSPLIT_METHODDEF
13893 UNICODE_JOIN_METHODDEF
13894 UNICODE_CAPITALIZE_METHODDEF
13895 UNICODE_CASEFOLD_METHODDEF
13896 UNICODE_TITLE_METHODDEF
13897 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013898 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013899 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013900 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013901 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013902 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013903 UNICODE_LJUST_METHODDEF
13904 UNICODE_LOWER_METHODDEF
13905 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013906 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13907 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013908 UNICODE_RJUST_METHODDEF
13909 UNICODE_RSTRIP_METHODDEF
13910 UNICODE_RPARTITION_METHODDEF
13911 UNICODE_SPLITLINES_METHODDEF
13912 UNICODE_STRIP_METHODDEF
13913 UNICODE_SWAPCASE_METHODDEF
13914 UNICODE_TRANSLATE_METHODDEF
13915 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013916 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13917 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013918 UNICODE_ISLOWER_METHODDEF
13919 UNICODE_ISUPPER_METHODDEF
13920 UNICODE_ISTITLE_METHODDEF
13921 UNICODE_ISSPACE_METHODDEF
13922 UNICODE_ISDECIMAL_METHODDEF
13923 UNICODE_ISDIGIT_METHODDEF
13924 UNICODE_ISNUMERIC_METHODDEF
13925 UNICODE_ISALPHA_METHODDEF
13926 UNICODE_ISALNUM_METHODDEF
13927 UNICODE_ISIDENTIFIER_METHODDEF
13928 UNICODE_ISPRINTABLE_METHODDEF
13929 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013930 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013931 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013932 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013933 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013934 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013935#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013936 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013937 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013938#endif
13939
Benjamin Peterson14339b62009-01-31 16:36:08 +000013940 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013941 {NULL, NULL}
13942};
13943
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013944static PyObject *
13945unicode_mod(PyObject *v, PyObject *w)
13946{
Brian Curtindfc80e32011-08-10 20:28:54 -050013947 if (!PyUnicode_Check(v))
13948 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013949 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013950}
13951
13952static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013953 0, /*nb_add*/
13954 0, /*nb_subtract*/
13955 0, /*nb_multiply*/
13956 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013957};
13958
Guido van Rossumd57fd912000-03-10 22:53:23 +000013959static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013960 (lenfunc) unicode_length, /* sq_length */
13961 PyUnicode_Concat, /* sq_concat */
13962 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13963 (ssizeargfunc) unicode_getitem, /* sq_item */
13964 0, /* sq_slice */
13965 0, /* sq_ass_item */
13966 0, /* sq_ass_slice */
13967 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013968};
13969
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013970static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013971unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013973 if (PyUnicode_READY(self) == -1)
13974 return NULL;
13975
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013976 if (PyIndex_Check(item)) {
13977 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013978 if (i == -1 && PyErr_Occurred())
13979 return NULL;
13980 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013981 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013982 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013983 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013984 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013985 PyObject *result;
13986 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013987 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013988 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013990 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013991 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013992 return NULL;
13993 }
13994
13995 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013996 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013997 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013998 slicelength == PyUnicode_GET_LENGTH(self)) {
13999 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014000 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014001 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014002 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014003 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014004 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014005 src_kind = PyUnicode_KIND(self);
14006 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014007 if (!PyUnicode_IS_ASCII(self)) {
14008 kind_limit = kind_maxchar_limit(src_kind);
14009 max_char = 0;
14010 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14011 ch = PyUnicode_READ(src_kind, src_data, cur);
14012 if (ch > max_char) {
14013 max_char = ch;
14014 if (max_char >= kind_limit)
14015 break;
14016 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014017 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014018 }
Victor Stinner55c99112011-10-13 01:17:06 +020014019 else
14020 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014021 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014022 if (result == NULL)
14023 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014024 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014025 dest_data = PyUnicode_DATA(result);
14026
14027 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014028 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14029 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014030 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014031 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014032 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014033 } else {
14034 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14035 return NULL;
14036 }
14037}
14038
14039static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014040 (lenfunc)unicode_length, /* mp_length */
14041 (binaryfunc)unicode_subscript, /* mp_subscript */
14042 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014043};
14044
Guido van Rossumd57fd912000-03-10 22:53:23 +000014045
Guido van Rossumd57fd912000-03-10 22:53:23 +000014046/* Helpers for PyUnicode_Format() */
14047
Victor Stinnera47082312012-10-04 02:19:54 +020014048struct unicode_formatter_t {
14049 PyObject *args;
14050 int args_owned;
14051 Py_ssize_t arglen, argidx;
14052 PyObject *dict;
14053
14054 enum PyUnicode_Kind fmtkind;
14055 Py_ssize_t fmtcnt, fmtpos;
14056 void *fmtdata;
14057 PyObject *fmtstr;
14058
14059 _PyUnicodeWriter writer;
14060};
14061
14062struct unicode_format_arg_t {
14063 Py_UCS4 ch;
14064 int flags;
14065 Py_ssize_t width;
14066 int prec;
14067 int sign;
14068};
14069
Guido van Rossumd57fd912000-03-10 22:53:23 +000014070static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014071unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014072{
Victor Stinnera47082312012-10-04 02:19:54 +020014073 Py_ssize_t argidx = ctx->argidx;
14074
14075 if (argidx < ctx->arglen) {
14076 ctx->argidx++;
14077 if (ctx->arglen < 0)
14078 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014079 else
Victor Stinnera47082312012-10-04 02:19:54 +020014080 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014081 }
14082 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014083 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014084 return NULL;
14085}
14086
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014087/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014088
Victor Stinnera47082312012-10-04 02:19:54 +020014089/* Format a float into the writer if the writer is not NULL, or into *p_output
14090 otherwise.
14091
14092 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014093static int
Victor Stinnera47082312012-10-04 02:19:54 +020014094formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14095 PyObject **p_output,
14096 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014097{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014098 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014099 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014100 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014101 int prec;
14102 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014103
Guido van Rossumd57fd912000-03-10 22:53:23 +000014104 x = PyFloat_AsDouble(v);
14105 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014106 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014107
Victor Stinnera47082312012-10-04 02:19:54 +020014108 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014109 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014110 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014111
Victor Stinnera47082312012-10-04 02:19:54 +020014112 if (arg->flags & F_ALT)
14113 dtoa_flags = Py_DTSF_ALT;
14114 else
14115 dtoa_flags = 0;
14116 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014117 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014118 return -1;
14119 len = strlen(p);
14120 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014121 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014122 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014123 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014124 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014125 }
14126 else
14127 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014128 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014129 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014130}
14131
Victor Stinnerd0880d52012-04-27 23:40:13 +020014132/* formatlong() emulates the format codes d, u, o, x and X, and
14133 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14134 * Python's regular ints.
14135 * Return value: a new PyUnicodeObject*, or NULL if error.
14136 * The output string is of the form
14137 * "-"? ("0x" | "0X")? digit+
14138 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14139 * set in flags. The case of hex digits will be correct,
14140 * There will be at least prec digits, zero-filled on the left if
14141 * necessary to get that many.
14142 * val object to be converted
14143 * flags bitmask of format flags; only F_ALT is looked at
14144 * prec minimum number of digits; 0-fill on left if needed
14145 * type a character in [duoxX]; u acts the same as d
14146 *
14147 * CAUTION: o, x and X conversions on regular ints can never
14148 * produce a '-' sign, but can for Python's unbounded ints.
14149 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014150PyObject *
14151_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014152{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014153 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014154 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014155 Py_ssize_t i;
14156 int sign; /* 1 if '-', else 0 */
14157 int len; /* number of characters */
14158 Py_ssize_t llen;
14159 int numdigits; /* len == numnondigits + numdigits */
14160 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014161
Victor Stinnerd0880d52012-04-27 23:40:13 +020014162 /* Avoid exceeding SSIZE_T_MAX */
14163 if (prec > INT_MAX-3) {
14164 PyErr_SetString(PyExc_OverflowError,
14165 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014166 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014167 }
14168
14169 assert(PyLong_Check(val));
14170
14171 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014172 default:
14173 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014174 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014175 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014176 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014177 /* int and int subclasses should print numerically when a numeric */
14178 /* format code is used (see issue18780) */
14179 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014180 break;
14181 case 'o':
14182 numnondigits = 2;
14183 result = PyNumber_ToBase(val, 8);
14184 break;
14185 case 'x':
14186 case 'X':
14187 numnondigits = 2;
14188 result = PyNumber_ToBase(val, 16);
14189 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014190 }
14191 if (!result)
14192 return NULL;
14193
14194 assert(unicode_modifiable(result));
14195 assert(PyUnicode_IS_READY(result));
14196 assert(PyUnicode_IS_ASCII(result));
14197
14198 /* To modify the string in-place, there can only be one reference. */
14199 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014200 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014201 PyErr_BadInternalCall();
14202 return NULL;
14203 }
14204 buf = PyUnicode_DATA(result);
14205 llen = PyUnicode_GET_LENGTH(result);
14206 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014207 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014208 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014209 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014210 return NULL;
14211 }
14212 len = (int)llen;
14213 sign = buf[0] == '-';
14214 numnondigits += sign;
14215 numdigits = len - numnondigits;
14216 assert(numdigits > 0);
14217
14218 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014219 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014220 (type == 'o' || type == 'x' || type == 'X'))) {
14221 assert(buf[sign] == '0');
14222 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14223 buf[sign+1] == 'o');
14224 numnondigits -= 2;
14225 buf += 2;
14226 len -= 2;
14227 if (sign)
14228 buf[0] = '-';
14229 assert(len == numnondigits + numdigits);
14230 assert(numdigits > 0);
14231 }
14232
14233 /* Fill with leading zeroes to meet minimum width. */
14234 if (prec > numdigits) {
14235 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14236 numnondigits + prec);
14237 char *b1;
14238 if (!r1) {
14239 Py_DECREF(result);
14240 return NULL;
14241 }
14242 b1 = PyBytes_AS_STRING(r1);
14243 for (i = 0; i < numnondigits; ++i)
14244 *b1++ = *buf++;
14245 for (i = 0; i < prec - numdigits; i++)
14246 *b1++ = '0';
14247 for (i = 0; i < numdigits; i++)
14248 *b1++ = *buf++;
14249 *b1 = '\0';
14250 Py_DECREF(result);
14251 result = r1;
14252 buf = PyBytes_AS_STRING(result);
14253 len = numnondigits + prec;
14254 }
14255
14256 /* Fix up case for hex conversions. */
14257 if (type == 'X') {
14258 /* Need to convert all lower case letters to upper case.
14259 and need to convert 0x to 0X (and -0x to -0X). */
14260 for (i = 0; i < len; i++)
14261 if (buf[i] >= 'a' && buf[i] <= 'x')
14262 buf[i] -= 'a'-'A';
14263 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014264 if (!PyUnicode_Check(result)
14265 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014266 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014267 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014268 Py_DECREF(result);
14269 result = unicode;
14270 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014271 else if (len != PyUnicode_GET_LENGTH(result)) {
14272 if (PyUnicode_Resize(&result, len) < 0)
14273 Py_CLEAR(result);
14274 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014275 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014276}
14277
Ethan Furmandf3ed242014-01-05 06:50:30 -080014278/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014279 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014280 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014281 * -1 and raise an exception on error */
14282static int
Victor Stinnera47082312012-10-04 02:19:54 +020014283mainformatlong(PyObject *v,
14284 struct unicode_format_arg_t *arg,
14285 PyObject **p_output,
14286 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014287{
14288 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014289 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014290
14291 if (!PyNumber_Check(v))
14292 goto wrongtype;
14293
Ethan Furman9ab74802014-03-21 06:38:46 -070014294 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014295 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014296 if (type == 'o' || type == 'x' || type == 'X') {
14297 iobj = PyNumber_Index(v);
14298 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014299 if (PyErr_ExceptionMatches(PyExc_TypeError))
14300 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014301 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014302 }
14303 }
14304 else {
14305 iobj = PyNumber_Long(v);
14306 if (iobj == NULL ) {
14307 if (PyErr_ExceptionMatches(PyExc_TypeError))
14308 goto wrongtype;
14309 return -1;
14310 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014311 }
14312 assert(PyLong_Check(iobj));
14313 }
14314 else {
14315 iobj = v;
14316 Py_INCREF(iobj);
14317 }
14318
14319 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014320 && arg->width == -1 && arg->prec == -1
14321 && !(arg->flags & (F_SIGN | F_BLANK))
14322 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014323 {
14324 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014325 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014326 int base;
14327
Victor Stinnera47082312012-10-04 02:19:54 +020014328 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014329 {
14330 default:
14331 assert(0 && "'type' not in [diuoxX]");
14332 case 'd':
14333 case 'i':
14334 case 'u':
14335 base = 10;
14336 break;
14337 case 'o':
14338 base = 8;
14339 break;
14340 case 'x':
14341 case 'X':
14342 base = 16;
14343 break;
14344 }
14345
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014346 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14347 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014348 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014349 }
14350 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014351 return 1;
14352 }
14353
Ethan Furmanb95b5612015-01-23 20:05:18 -080014354 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014355 Py_DECREF(iobj);
14356 if (res == NULL)
14357 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014358 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014359 return 0;
14360
14361wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014362 switch(type)
14363 {
14364 case 'o':
14365 case 'x':
14366 case 'X':
14367 PyErr_Format(PyExc_TypeError,
14368 "%%%c format: an integer is required, "
14369 "not %.200s",
14370 type, Py_TYPE(v)->tp_name);
14371 break;
14372 default:
14373 PyErr_Format(PyExc_TypeError,
14374 "%%%c format: a number is required, "
14375 "not %.200s",
14376 type, Py_TYPE(v)->tp_name);
14377 break;
14378 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014379 return -1;
14380}
14381
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014382static Py_UCS4
14383formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014384{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014385 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014386 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014387 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014388 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014389 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014390 goto onError;
14391 }
14392 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014393 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014394 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014395 /* make sure number is a type of integer */
14396 if (!PyLong_Check(v)) {
14397 iobj = PyNumber_Index(v);
14398 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014399 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014400 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014401 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014402 Py_DECREF(iobj);
14403 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014404 else {
14405 x = PyLong_AsLong(v);
14406 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014407 if (x == -1 && PyErr_Occurred())
14408 goto onError;
14409
Victor Stinner8faf8212011-12-08 22:14:11 +010014410 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014411 PyErr_SetString(PyExc_OverflowError,
14412 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014413 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014414 }
14415
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014416 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014417 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014418
Benjamin Peterson29060642009-01-31 22:14:21 +000014419 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014420 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014421 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014422 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014423}
14424
Victor Stinnera47082312012-10-04 02:19:54 +020014425/* Parse options of an argument: flags, width, precision.
14426 Handle also "%(name)" syntax.
14427
14428 Return 0 if the argument has been formatted into arg->str.
14429 Return 1 if the argument has been written into ctx->writer,
14430 Raise an exception and return -1 on error. */
14431static int
14432unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14433 struct unicode_format_arg_t *arg)
14434{
14435#define FORMAT_READ(ctx) \
14436 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14437
14438 PyObject *v;
14439
Victor Stinnera47082312012-10-04 02:19:54 +020014440 if (arg->ch == '(') {
14441 /* Get argument value from a dictionary. Example: "%(name)s". */
14442 Py_ssize_t keystart;
14443 Py_ssize_t keylen;
14444 PyObject *key;
14445 int pcount = 1;
14446
14447 if (ctx->dict == NULL) {
14448 PyErr_SetString(PyExc_TypeError,
14449 "format requires a mapping");
14450 return -1;
14451 }
14452 ++ctx->fmtpos;
14453 --ctx->fmtcnt;
14454 keystart = ctx->fmtpos;
14455 /* Skip over balanced parentheses */
14456 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14457 arg->ch = FORMAT_READ(ctx);
14458 if (arg->ch == ')')
14459 --pcount;
14460 else if (arg->ch == '(')
14461 ++pcount;
14462 ctx->fmtpos++;
14463 }
14464 keylen = ctx->fmtpos - keystart - 1;
14465 if (ctx->fmtcnt < 0 || pcount > 0) {
14466 PyErr_SetString(PyExc_ValueError,
14467 "incomplete format key");
14468 return -1;
14469 }
14470 key = PyUnicode_Substring(ctx->fmtstr,
14471 keystart, keystart + keylen);
14472 if (key == NULL)
14473 return -1;
14474 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014475 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014476 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014477 }
14478 ctx->args = PyObject_GetItem(ctx->dict, key);
14479 Py_DECREF(key);
14480 if (ctx->args == NULL)
14481 return -1;
14482 ctx->args_owned = 1;
14483 ctx->arglen = -1;
14484 ctx->argidx = -2;
14485 }
14486
14487 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014488 while (--ctx->fmtcnt >= 0) {
14489 arg->ch = FORMAT_READ(ctx);
14490 ctx->fmtpos++;
14491 switch (arg->ch) {
14492 case '-': arg->flags |= F_LJUST; continue;
14493 case '+': arg->flags |= F_SIGN; continue;
14494 case ' ': arg->flags |= F_BLANK; continue;
14495 case '#': arg->flags |= F_ALT; continue;
14496 case '0': arg->flags |= F_ZERO; continue;
14497 }
14498 break;
14499 }
14500
14501 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014502 if (arg->ch == '*') {
14503 v = unicode_format_getnextarg(ctx);
14504 if (v == NULL)
14505 return -1;
14506 if (!PyLong_Check(v)) {
14507 PyErr_SetString(PyExc_TypeError,
14508 "* wants int");
14509 return -1;
14510 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014511 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014512 if (arg->width == -1 && PyErr_Occurred())
14513 return -1;
14514 if (arg->width < 0) {
14515 arg->flags |= F_LJUST;
14516 arg->width = -arg->width;
14517 }
14518 if (--ctx->fmtcnt >= 0) {
14519 arg->ch = FORMAT_READ(ctx);
14520 ctx->fmtpos++;
14521 }
14522 }
14523 else if (arg->ch >= '0' && arg->ch <= '9') {
14524 arg->width = arg->ch - '0';
14525 while (--ctx->fmtcnt >= 0) {
14526 arg->ch = FORMAT_READ(ctx);
14527 ctx->fmtpos++;
14528 if (arg->ch < '0' || arg->ch > '9')
14529 break;
14530 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14531 mixing signed and unsigned comparison. Since arg->ch is between
14532 '0' and '9', casting to int is safe. */
14533 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14534 PyErr_SetString(PyExc_ValueError,
14535 "width too big");
14536 return -1;
14537 }
14538 arg->width = arg->width*10 + (arg->ch - '0');
14539 }
14540 }
14541
14542 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014543 if (arg->ch == '.') {
14544 arg->prec = 0;
14545 if (--ctx->fmtcnt >= 0) {
14546 arg->ch = FORMAT_READ(ctx);
14547 ctx->fmtpos++;
14548 }
14549 if (arg->ch == '*') {
14550 v = unicode_format_getnextarg(ctx);
14551 if (v == NULL)
14552 return -1;
14553 if (!PyLong_Check(v)) {
14554 PyErr_SetString(PyExc_TypeError,
14555 "* wants int");
14556 return -1;
14557 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014558 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014559 if (arg->prec == -1 && PyErr_Occurred())
14560 return -1;
14561 if (arg->prec < 0)
14562 arg->prec = 0;
14563 if (--ctx->fmtcnt >= 0) {
14564 arg->ch = FORMAT_READ(ctx);
14565 ctx->fmtpos++;
14566 }
14567 }
14568 else if (arg->ch >= '0' && arg->ch <= '9') {
14569 arg->prec = arg->ch - '0';
14570 while (--ctx->fmtcnt >= 0) {
14571 arg->ch = FORMAT_READ(ctx);
14572 ctx->fmtpos++;
14573 if (arg->ch < '0' || arg->ch > '9')
14574 break;
14575 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14576 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014577 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014578 return -1;
14579 }
14580 arg->prec = arg->prec*10 + (arg->ch - '0');
14581 }
14582 }
14583 }
14584
14585 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14586 if (ctx->fmtcnt >= 0) {
14587 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14588 if (--ctx->fmtcnt >= 0) {
14589 arg->ch = FORMAT_READ(ctx);
14590 ctx->fmtpos++;
14591 }
14592 }
14593 }
14594 if (ctx->fmtcnt < 0) {
14595 PyErr_SetString(PyExc_ValueError,
14596 "incomplete format");
14597 return -1;
14598 }
14599 return 0;
14600
14601#undef FORMAT_READ
14602}
14603
14604/* Format one argument. Supported conversion specifiers:
14605
14606 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014607 - "i", "d", "u": int or float
14608 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014609 - "e", "E", "f", "F", "g", "G": float
14610 - "c": int or str (1 character)
14611
Victor Stinner8dbd4212012-12-04 09:30:24 +010014612 When possible, the output is written directly into the Unicode writer
14613 (ctx->writer). A string is created when padding is required.
14614
Victor Stinnera47082312012-10-04 02:19:54 +020014615 Return 0 if the argument has been formatted into *p_str,
14616 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014617 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014618static int
14619unicode_format_arg_format(struct unicode_formatter_t *ctx,
14620 struct unicode_format_arg_t *arg,
14621 PyObject **p_str)
14622{
14623 PyObject *v;
14624 _PyUnicodeWriter *writer = &ctx->writer;
14625
14626 if (ctx->fmtcnt == 0)
14627 ctx->writer.overallocate = 0;
14628
Victor Stinnera47082312012-10-04 02:19:54 +020014629 v = unicode_format_getnextarg(ctx);
14630 if (v == NULL)
14631 return -1;
14632
Victor Stinnera47082312012-10-04 02:19:54 +020014633
14634 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014635 case 's':
14636 case 'r':
14637 case 'a':
14638 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14639 /* Fast path */
14640 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14641 return -1;
14642 return 1;
14643 }
14644
14645 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14646 *p_str = v;
14647 Py_INCREF(*p_str);
14648 }
14649 else {
14650 if (arg->ch == 's')
14651 *p_str = PyObject_Str(v);
14652 else if (arg->ch == 'r')
14653 *p_str = PyObject_Repr(v);
14654 else
14655 *p_str = PyObject_ASCII(v);
14656 }
14657 break;
14658
14659 case 'i':
14660 case 'd':
14661 case 'u':
14662 case 'o':
14663 case 'x':
14664 case 'X':
14665 {
14666 int ret = mainformatlong(v, arg, p_str, writer);
14667 if (ret != 0)
14668 return ret;
14669 arg->sign = 1;
14670 break;
14671 }
14672
14673 case 'e':
14674 case 'E':
14675 case 'f':
14676 case 'F':
14677 case 'g':
14678 case 'G':
14679 if (arg->width == -1 && arg->prec == -1
14680 && !(arg->flags & (F_SIGN | F_BLANK)))
14681 {
14682 /* Fast path */
14683 if (formatfloat(v, arg, NULL, writer) == -1)
14684 return -1;
14685 return 1;
14686 }
14687
14688 arg->sign = 1;
14689 if (formatfloat(v, arg, p_str, NULL) == -1)
14690 return -1;
14691 break;
14692
14693 case 'c':
14694 {
14695 Py_UCS4 ch = formatchar(v);
14696 if (ch == (Py_UCS4) -1)
14697 return -1;
14698 if (arg->width == -1 && arg->prec == -1) {
14699 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014700 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014701 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014702 return 1;
14703 }
14704 *p_str = PyUnicode_FromOrdinal(ch);
14705 break;
14706 }
14707
14708 default:
14709 PyErr_Format(PyExc_ValueError,
14710 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014711 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014712 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14713 (int)arg->ch,
14714 ctx->fmtpos - 1);
14715 return -1;
14716 }
14717 if (*p_str == NULL)
14718 return -1;
14719 assert (PyUnicode_Check(*p_str));
14720 return 0;
14721}
14722
14723static int
14724unicode_format_arg_output(struct unicode_formatter_t *ctx,
14725 struct unicode_format_arg_t *arg,
14726 PyObject *str)
14727{
14728 Py_ssize_t len;
14729 enum PyUnicode_Kind kind;
14730 void *pbuf;
14731 Py_ssize_t pindex;
14732 Py_UCS4 signchar;
14733 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014734 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014735 Py_ssize_t sublen;
14736 _PyUnicodeWriter *writer = &ctx->writer;
14737 Py_UCS4 fill;
14738
14739 fill = ' ';
14740 if (arg->sign && arg->flags & F_ZERO)
14741 fill = '0';
14742
14743 if (PyUnicode_READY(str) == -1)
14744 return -1;
14745
14746 len = PyUnicode_GET_LENGTH(str);
14747 if ((arg->width == -1 || arg->width <= len)
14748 && (arg->prec == -1 || arg->prec >= len)
14749 && !(arg->flags & (F_SIGN | F_BLANK)))
14750 {
14751 /* Fast path */
14752 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14753 return -1;
14754 return 0;
14755 }
14756
14757 /* Truncate the string for "s", "r" and "a" formats
14758 if the precision is set */
14759 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14760 if (arg->prec >= 0 && len > arg->prec)
14761 len = arg->prec;
14762 }
14763
14764 /* Adjust sign and width */
14765 kind = PyUnicode_KIND(str);
14766 pbuf = PyUnicode_DATA(str);
14767 pindex = 0;
14768 signchar = '\0';
14769 if (arg->sign) {
14770 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14771 if (ch == '-' || ch == '+') {
14772 signchar = ch;
14773 len--;
14774 pindex++;
14775 }
14776 else if (arg->flags & F_SIGN)
14777 signchar = '+';
14778 else if (arg->flags & F_BLANK)
14779 signchar = ' ';
14780 else
14781 arg->sign = 0;
14782 }
14783 if (arg->width < len)
14784 arg->width = len;
14785
14786 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014787 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014788 if (!(arg->flags & F_LJUST)) {
14789 if (arg->sign) {
14790 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014791 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014792 }
14793 else {
14794 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014795 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014796 }
14797 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014798 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14799 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014800 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014801 }
14802
Victor Stinnera47082312012-10-04 02:19:54 +020014803 buflen = arg->width;
14804 if (arg->sign && len == arg->width)
14805 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014806 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014807 return -1;
14808
14809 /* Write the sign if needed */
14810 if (arg->sign) {
14811 if (fill != ' ') {
14812 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14813 writer->pos += 1;
14814 }
14815 if (arg->width > len)
14816 arg->width--;
14817 }
14818
14819 /* Write the numeric prefix for "x", "X" and "o" formats
14820 if the alternate form is used.
14821 For example, write "0x" for the "%#x" format. */
14822 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14823 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14824 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14825 if (fill != ' ') {
14826 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14827 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14828 writer->pos += 2;
14829 pindex += 2;
14830 }
14831 arg->width -= 2;
14832 if (arg->width < 0)
14833 arg->width = 0;
14834 len -= 2;
14835 }
14836
14837 /* Pad left with the fill character if needed */
14838 if (arg->width > len && !(arg->flags & F_LJUST)) {
14839 sublen = arg->width - len;
14840 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14841 writer->pos += sublen;
14842 arg->width = len;
14843 }
14844
14845 /* If padding with spaces: write sign if needed and/or numeric prefix if
14846 the alternate form is used */
14847 if (fill == ' ') {
14848 if (arg->sign) {
14849 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14850 writer->pos += 1;
14851 }
14852 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14853 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14854 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14855 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14856 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14857 writer->pos += 2;
14858 pindex += 2;
14859 }
14860 }
14861
14862 /* Write characters */
14863 if (len) {
14864 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14865 str, pindex, len);
14866 writer->pos += len;
14867 }
14868
14869 /* Pad right with the fill character if needed */
14870 if (arg->width > len) {
14871 sublen = arg->width - len;
14872 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14873 writer->pos += sublen;
14874 }
14875 return 0;
14876}
14877
14878/* Helper of PyUnicode_Format(): format one arg.
14879 Return 0 on success, raise an exception and return -1 on error. */
14880static int
14881unicode_format_arg(struct unicode_formatter_t *ctx)
14882{
14883 struct unicode_format_arg_t arg;
14884 PyObject *str;
14885 int ret;
14886
Victor Stinner8dbd4212012-12-04 09:30:24 +010014887 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014888 if (arg.ch == '%') {
14889 ctx->fmtpos++;
14890 ctx->fmtcnt--;
14891 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14892 return -1;
14893 return 0;
14894 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014895 arg.flags = 0;
14896 arg.width = -1;
14897 arg.prec = -1;
14898 arg.sign = 0;
14899 str = NULL;
14900
Victor Stinnera47082312012-10-04 02:19:54 +020014901 ret = unicode_format_arg_parse(ctx, &arg);
14902 if (ret == -1)
14903 return -1;
14904
14905 ret = unicode_format_arg_format(ctx, &arg, &str);
14906 if (ret == -1)
14907 return -1;
14908
14909 if (ret != 1) {
14910 ret = unicode_format_arg_output(ctx, &arg, str);
14911 Py_DECREF(str);
14912 if (ret == -1)
14913 return -1;
14914 }
14915
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014916 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014917 PyErr_SetString(PyExc_TypeError,
14918 "not all arguments converted during string formatting");
14919 return -1;
14920 }
14921 return 0;
14922}
14923
Alexander Belopolsky40018472011-02-26 01:02:56 +000014924PyObject *
14925PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014926{
Victor Stinnera47082312012-10-04 02:19:54 +020014927 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014928
Guido van Rossumd57fd912000-03-10 22:53:23 +000014929 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014930 PyErr_BadInternalCall();
14931 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014932 }
Victor Stinnera47082312012-10-04 02:19:54 +020014933
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014934 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014935 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014936
14937 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014938 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14939 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14940 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14941 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014942
Victor Stinner8f674cc2013-04-17 23:02:17 +020014943 _PyUnicodeWriter_Init(&ctx.writer);
14944 ctx.writer.min_length = ctx.fmtcnt + 100;
14945 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014946
Guido van Rossumd57fd912000-03-10 22:53:23 +000014947 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014948 ctx.arglen = PyTuple_Size(args);
14949 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014950 }
14951 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014952 ctx.arglen = -1;
14953 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014954 }
Victor Stinnera47082312012-10-04 02:19:54 +020014955 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014956 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014957 ctx.dict = args;
14958 else
14959 ctx.dict = NULL;
14960 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014961
Victor Stinnera47082312012-10-04 02:19:54 +020014962 while (--ctx.fmtcnt >= 0) {
14963 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014964 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014965
14966 nonfmtpos = ctx.fmtpos++;
14967 while (ctx.fmtcnt >= 0 &&
14968 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14969 ctx.fmtpos++;
14970 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014971 }
Victor Stinnera47082312012-10-04 02:19:54 +020014972 if (ctx.fmtcnt < 0) {
14973 ctx.fmtpos--;
14974 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014975 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014976
Victor Stinnercfc4c132013-04-03 01:48:39 +020014977 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14978 nonfmtpos, ctx.fmtpos) < 0)
14979 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014980 }
14981 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014982 ctx.fmtpos++;
14983 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014984 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014985 }
14986 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014987
Victor Stinnera47082312012-10-04 02:19:54 +020014988 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014989 PyErr_SetString(PyExc_TypeError,
14990 "not all arguments converted during string formatting");
14991 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014992 }
14993
Victor Stinnera47082312012-10-04 02:19:54 +020014994 if (ctx.args_owned) {
14995 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014996 }
Victor Stinnera47082312012-10-04 02:19:54 +020014997 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014998
Benjamin Peterson29060642009-01-31 22:14:21 +000014999 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015000 _PyUnicodeWriter_Dealloc(&ctx.writer);
15001 if (ctx.args_owned) {
15002 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015003 }
15004 return NULL;
15005}
15006
Jeremy Hylton938ace62002-07-17 16:30:39 +000015007static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015008unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15009
Tim Peters6d6c1a32001-08-02 04:15:00 +000015010static PyObject *
15011unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15012{
Benjamin Peterson29060642009-01-31 22:14:21 +000015013 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015014 static char *kwlist[] = {"object", "encoding", "errors", 0};
15015 char *encoding = NULL;
15016 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015017
Benjamin Peterson14339b62009-01-31 16:36:08 +000015018 if (type != &PyUnicode_Type)
15019 return unicode_subtype_new(type, args, kwds);
15020 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015021 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015022 return NULL;
15023 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015024 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015025 if (encoding == NULL && errors == NULL)
15026 return PyObject_Str(x);
15027 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015028 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015029}
15030
Guido van Rossume023fe02001-08-30 03:12:59 +000015031static PyObject *
15032unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15033{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015034 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015035 Py_ssize_t length, char_size;
15036 int share_wstr, share_utf8;
15037 unsigned int kind;
15038 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015039
Benjamin Peterson14339b62009-01-31 16:36:08 +000015040 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015041
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015042 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015043 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015045 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015046 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015047 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015048 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015049 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015050
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015051 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015052 if (self == NULL) {
15053 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 return NULL;
15055 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015056 kind = PyUnicode_KIND(unicode);
15057 length = PyUnicode_GET_LENGTH(unicode);
15058
15059 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015060#ifdef Py_DEBUG
15061 _PyUnicode_HASH(self) = -1;
15062#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015063 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015064#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015065 _PyUnicode_STATE(self).interned = 0;
15066 _PyUnicode_STATE(self).kind = kind;
15067 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015068 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015069 _PyUnicode_STATE(self).ready = 1;
15070 _PyUnicode_WSTR(self) = NULL;
15071 _PyUnicode_UTF8_LENGTH(self) = 0;
15072 _PyUnicode_UTF8(self) = NULL;
15073 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015074 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075
15076 share_utf8 = 0;
15077 share_wstr = 0;
15078 if (kind == PyUnicode_1BYTE_KIND) {
15079 char_size = 1;
15080 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15081 share_utf8 = 1;
15082 }
15083 else if (kind == PyUnicode_2BYTE_KIND) {
15084 char_size = 2;
15085 if (sizeof(wchar_t) == 2)
15086 share_wstr = 1;
15087 }
15088 else {
15089 assert(kind == PyUnicode_4BYTE_KIND);
15090 char_size = 4;
15091 if (sizeof(wchar_t) == 4)
15092 share_wstr = 1;
15093 }
15094
15095 /* Ensure we won't overflow the length. */
15096 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15097 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015098 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015099 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015100 data = PyObject_MALLOC((length + 1) * char_size);
15101 if (data == NULL) {
15102 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015103 goto onError;
15104 }
15105
Victor Stinnerc3c74152011-10-02 20:39:55 +020015106 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015107 if (share_utf8) {
15108 _PyUnicode_UTF8_LENGTH(self) = length;
15109 _PyUnicode_UTF8(self) = data;
15110 }
15111 if (share_wstr) {
15112 _PyUnicode_WSTR_LENGTH(self) = length;
15113 _PyUnicode_WSTR(self) = (wchar_t *)data;
15114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015115
Christian Heimesf051e432016-09-13 20:22:02 +020015116 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015117 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015118 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015119#ifdef Py_DEBUG
15120 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15121#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015122 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015123 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015124
15125onError:
15126 Py_DECREF(unicode);
15127 Py_DECREF(self);
15128 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015129}
15130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015131PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015132"str(object='') -> str\n\
15133str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015134\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015135Create a new string object from the given object. If encoding or\n\
15136errors is specified, then the object must expose a data buffer\n\
15137that will be decoded using the given encoding and error handler.\n\
15138Otherwise, returns the result of object.__str__() (if defined)\n\
15139or repr(object).\n\
15140encoding defaults to sys.getdefaultencoding().\n\
15141errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015142
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015143static PyObject *unicode_iter(PyObject *seq);
15144
Guido van Rossumd57fd912000-03-10 22:53:23 +000015145PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015146 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015147 "str", /* tp_name */
15148 sizeof(PyUnicodeObject), /* tp_size */
15149 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015150 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015151 (destructor)unicode_dealloc, /* tp_dealloc */
15152 0, /* tp_print */
15153 0, /* tp_getattr */
15154 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015155 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015156 unicode_repr, /* tp_repr */
15157 &unicode_as_number, /* tp_as_number */
15158 &unicode_as_sequence, /* tp_as_sequence */
15159 &unicode_as_mapping, /* tp_as_mapping */
15160 (hashfunc) unicode_hash, /* tp_hash*/
15161 0, /* tp_call*/
15162 (reprfunc) unicode_str, /* tp_str */
15163 PyObject_GenericGetAttr, /* tp_getattro */
15164 0, /* tp_setattro */
15165 0, /* tp_as_buffer */
15166 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015167 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015168 unicode_doc, /* tp_doc */
15169 0, /* tp_traverse */
15170 0, /* tp_clear */
15171 PyUnicode_RichCompare, /* tp_richcompare */
15172 0, /* tp_weaklistoffset */
15173 unicode_iter, /* tp_iter */
15174 0, /* tp_iternext */
15175 unicode_methods, /* tp_methods */
15176 0, /* tp_members */
15177 0, /* tp_getset */
15178 &PyBaseObject_Type, /* tp_base */
15179 0, /* tp_dict */
15180 0, /* tp_descr_get */
15181 0, /* tp_descr_set */
15182 0, /* tp_dictoffset */
15183 0, /* tp_init */
15184 0, /* tp_alloc */
15185 unicode_new, /* tp_new */
15186 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015187};
15188
15189/* Initialize the Unicode implementation */
15190
Victor Stinner3a50e702011-10-18 21:21:00 +020015191int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015192{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015193 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015194 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015195 0x000A, /* LINE FEED */
15196 0x000D, /* CARRIAGE RETURN */
15197 0x001C, /* FILE SEPARATOR */
15198 0x001D, /* GROUP SEPARATOR */
15199 0x001E, /* RECORD SEPARATOR */
15200 0x0085, /* NEXT LINE */
15201 0x2028, /* LINE SEPARATOR */
15202 0x2029, /* PARAGRAPH SEPARATOR */
15203 };
15204
Fred Drakee4315f52000-05-09 19:53:39 +000015205 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015206 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015207 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015208 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015209 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015210
Guido van Rossumcacfc072002-05-24 19:01:59 +000015211 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015212 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015213
15214 /* initialize the linebreak bloom filter */
15215 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015216 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015217 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015218
Christian Heimes26532f72013-07-20 14:57:16 +020015219 if (PyType_Ready(&EncodingMapType) < 0)
15220 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015221
Benjamin Petersonc4311282012-10-30 23:21:10 -040015222 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15223 Py_FatalError("Can't initialize field name iterator type");
15224
15225 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15226 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015227
Victor Stinner3a50e702011-10-18 21:21:00 +020015228 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015229}
15230
15231/* Finalize the Unicode implementation */
15232
/* Free-list clearing hook.  This implementation has nothing to release,
   so the reported count is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15238
Guido van Rossumd57fd912000-03-10 22:53:23 +000015239void
Thomas Wouters78890102000-07-22 19:25:51 +000015240_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015241{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015242 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015243
Serhiy Storchaka05997252013-01-26 12:14:02 +020015244 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015245
Serhiy Storchaka05997252013-01-26 12:14:02 +020015246 for (i = 0; i < 256; i++)
15247 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015248 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015249 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015250}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015251
Walter Dörwald16807132007-05-25 13:52:07 +000015252void
15253PyUnicode_InternInPlace(PyObject **p)
15254{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015255 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015257#ifdef Py_DEBUG
15258 assert(s != NULL);
15259 assert(_PyUnicode_CHECK(s));
15260#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015261 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015262 return;
15263#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015264 /* If it's a subclass, we don't really know what putting
15265 it in the interned dict might do. */
15266 if (!PyUnicode_CheckExact(s))
15267 return;
15268 if (PyUnicode_CHECK_INTERNED(s))
15269 return;
15270 if (interned == NULL) {
15271 interned = PyDict_New();
15272 if (interned == NULL) {
15273 PyErr_Clear(); /* Don't leave an exception */
15274 return;
15275 }
15276 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015277 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015278 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015279 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015280 if (t == NULL) {
15281 PyErr_Clear();
15282 return;
15283 }
15284 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015285 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015286 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015287 return;
15288 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 /* The two references in interned are not counted by refcnt.
15290 The deallocator will take care of this */
15291 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015292 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015293}
15294
15295void
15296PyUnicode_InternImmortal(PyObject **p)
15297{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 PyUnicode_InternInPlace(p);
15299 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015300 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015301 Py_INCREF(*p);
15302 }
Walter Dörwald16807132007-05-25 13:52:07 +000015303}
15304
15305PyObject *
15306PyUnicode_InternFromString(const char *cp)
15307{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015308 PyObject *s = PyUnicode_FromString(cp);
15309 if (s == NULL)
15310 return NULL;
15311 PyUnicode_InternInPlace(&s);
15312 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015313}
15314
Alexander Belopolsky40018472011-02-26 01:02:56 +000015315void
15316_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015317{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015318 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015319 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015320 Py_ssize_t i, n;
15321 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015322
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 if (interned == NULL || !PyDict_Check(interned))
15324 return;
15325 keys = PyDict_Keys(interned);
15326 if (keys == NULL || !PyList_Check(keys)) {
15327 PyErr_Clear();
15328 return;
15329 }
Walter Dörwald16807132007-05-25 13:52:07 +000015330
Benjamin Peterson14339b62009-01-31 16:36:08 +000015331 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15332 detector, interned unicode strings are not forcibly deallocated;
15333 rather, we give them their stolen references back, and then clear
15334 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015335
Benjamin Peterson14339b62009-01-31 16:36:08 +000015336 n = PyList_GET_SIZE(keys);
15337 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015338 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015339 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015340 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015341 if (PyUnicode_READY(s) == -1) {
15342 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015343 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015345 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 case SSTATE_NOT_INTERNED:
15347 /* XXX Shouldn't happen */
15348 break;
15349 case SSTATE_INTERNED_IMMORTAL:
15350 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015351 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 break;
15353 case SSTATE_INTERNED_MORTAL:
15354 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015355 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 break;
15357 default:
15358 Py_FatalError("Inconsistent interned string state.");
15359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015360 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 }
15362 fprintf(stderr, "total size of all interned strings: "
15363 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15364 "mortal/immortal\n", mortal_size, immortal_size);
15365 Py_DECREF(keys);
15366 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015367 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015368}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015369
15370
15371/********************* Unicode Iterator **************************/
15372
15373typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 PyObject_HEAD
15375 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015376 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015377} unicodeiterobject;
15378
15379static void
15380unicodeiter_dealloc(unicodeiterobject *it)
15381{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015382 _PyObject_GC_UNTRACK(it);
15383 Py_XDECREF(it->it_seq);
15384 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015385}
15386
15387static int
15388unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15389{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015390 Py_VISIT(it->it_seq);
15391 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015392}
15393
15394static PyObject *
15395unicodeiter_next(unicodeiterobject *it)
15396{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015397 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015398
Benjamin Peterson14339b62009-01-31 16:36:08 +000015399 assert(it != NULL);
15400 seq = it->it_seq;
15401 if (seq == NULL)
15402 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015403 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015405 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15406 int kind = PyUnicode_KIND(seq);
15407 void *data = PyUnicode_DATA(seq);
15408 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15409 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015410 if (item != NULL)
15411 ++it->it_index;
15412 return item;
15413 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015414
Benjamin Peterson14339b62009-01-31 16:36:08 +000015415 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015416 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015418}
15419
15420static PyObject *
15421unicodeiter_len(unicodeiterobject *it)
15422{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 Py_ssize_t len = 0;
15424 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015425 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015426 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015427}
15428
15429PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15430
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015431static PyObject *
15432unicodeiter_reduce(unicodeiterobject *it)
15433{
15434 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015435 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015436 it->it_seq, it->it_index);
15437 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015438 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015439 if (u == NULL)
15440 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015441 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015442 }
15443}
15444
15445PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15446
15447static PyObject *
15448unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15449{
15450 Py_ssize_t index = PyLong_AsSsize_t(state);
15451 if (index == -1 && PyErr_Occurred())
15452 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015453 if (it->it_seq != NULL) {
15454 if (index < 0)
15455 index = 0;
15456 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15457 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15458 it->it_index = index;
15459 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015460 Py_RETURN_NONE;
15461}
15462
15463PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15464
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015465static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015466 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015467 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015468 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15469 reduce_doc},
15470 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15471 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015472 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015473};
15474
15475PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015476 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15477 "str_iterator", /* tp_name */
15478 sizeof(unicodeiterobject), /* tp_basicsize */
15479 0, /* tp_itemsize */
15480 /* methods */
15481 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15482 0, /* tp_print */
15483 0, /* tp_getattr */
15484 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015485 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015486 0, /* tp_repr */
15487 0, /* tp_as_number */
15488 0, /* tp_as_sequence */
15489 0, /* tp_as_mapping */
15490 0, /* tp_hash */
15491 0, /* tp_call */
15492 0, /* tp_str */
15493 PyObject_GenericGetAttr, /* tp_getattro */
15494 0, /* tp_setattro */
15495 0, /* tp_as_buffer */
15496 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15497 0, /* tp_doc */
15498 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15499 0, /* tp_clear */
15500 0, /* tp_richcompare */
15501 0, /* tp_weaklistoffset */
15502 PyObject_SelfIter, /* tp_iter */
15503 (iternextfunc)unicodeiter_next, /* tp_iternext */
15504 unicodeiter_methods, /* tp_methods */
15505 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015506};
15507
15508static PyObject *
15509unicode_iter(PyObject *seq)
15510{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015511 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015512
Benjamin Peterson14339b62009-01-31 16:36:08 +000015513 if (!PyUnicode_Check(seq)) {
15514 PyErr_BadInternalCall();
15515 return NULL;
15516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015517 if (PyUnicode_READY(seq) == -1)
15518 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015519 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15520 if (it == NULL)
15521 return NULL;
15522 it->it_index = 0;
15523 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015524 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015525 _PyObject_GC_TRACK(it);
15526 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015527}
15528
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015529
15530size_t
15531Py_UNICODE_strlen(const Py_UNICODE *u)
15532{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015533 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015534}
15535
15536Py_UNICODE*
15537Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15538{
15539 Py_UNICODE *u = s1;
15540 while ((*u++ = *s2++));
15541 return s1;
15542}
15543
15544Py_UNICODE*
15545Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15546{
15547 Py_UNICODE *u = s1;
15548 while ((*u++ = *s2++))
15549 if (n-- == 0)
15550 break;
15551 return s1;
15552}
15553
15554Py_UNICODE*
15555Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15556{
15557 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015558 u1 += wcslen(u1);
15559 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015560 return s1;
15561}
15562
15563int
15564Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15565{
15566 while (*s1 && *s2 && *s1 == *s2)
15567 s1++, s2++;
15568 if (*s1 && *s2)
15569 return (*s1 < *s2) ? -1 : +1;
15570 if (*s1)
15571 return 1;
15572 if (*s2)
15573 return -1;
15574 return 0;
15575}
15576
15577int
15578Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15579{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015580 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015581 for (; n != 0; n--) {
15582 u1 = *s1;
15583 u2 = *s2;
15584 if (u1 != u2)
15585 return (u1 < u2) ? -1 : +1;
15586 if (u1 == '\0')
15587 return 0;
15588 s1++;
15589 s2++;
15590 }
15591 return 0;
15592}
15593
15594Py_UNICODE*
15595Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15596{
15597 const Py_UNICODE *p;
15598 for (p = s; *p; p++)
15599 if (*p == c)
15600 return (Py_UNICODE*)p;
15601 return NULL;
15602}
15603
15604Py_UNICODE*
15605Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15606{
15607 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015608 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015609 while (p != s) {
15610 p--;
15611 if (*p == c)
15612 return (Py_UNICODE*)p;
15613 }
15614 return NULL;
15615}
Victor Stinner331ea922010-08-10 16:37:20 +000015616
Victor Stinner71133ff2010-09-01 23:43:53 +000015617Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015618PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015619{
Victor Stinner577db2c2011-10-11 22:12:48 +020015620 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015621 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015623 if (!PyUnicode_Check(unicode)) {
15624 PyErr_BadArgument();
15625 return NULL;
15626 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015627 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015628 if (u == NULL)
15629 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015630 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015631 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015632 PyErr_NoMemory();
15633 return NULL;
15634 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015635 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015636 size *= sizeof(Py_UNICODE);
15637 copy = PyMem_Malloc(size);
15638 if (copy == NULL) {
15639 PyErr_NoMemory();
15640 return NULL;
15641 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015642 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015643 return copy;
15644}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015645
Georg Brandl66c221e2010-10-14 07:04:07 +000015646/* A _string module, to export formatter_parser and formatter_field_name_split
15647 to the string.Formatter class implemented in Python. */
15648
15649static PyMethodDef _string_methods[] = {
15650 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15651 METH_O, PyDoc_STR("split the argument as a field name")},
15652 {"formatter_parser", (PyCFunction) formatter_parser,
15653 METH_O, PyDoc_STR("parse the argument as a format string")},
15654 {NULL, NULL}
15655};
15656
15657static struct PyModuleDef _string_module = {
15658 PyModuleDef_HEAD_INIT,
15659 "_string",
15660 PyDoc_STR("string helper module"),
15661 0,
15662 _string_methods,
15663 NULL,
15664 NULL,
15665 NULL,
15666 NULL
15667};
15668
15669PyMODINIT_FUNC
15670PyInit__string(void)
15671{
15672 return PyModule_Create(&_string_module);
15673}
15674
15675
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015676#ifdef __cplusplus
15677}
15678#endif