blob: 58899adc463d6115f47d130816000ec90383e562 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090052class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
55
56/*[python input]
57class Py_UCS4_converter(CConverter):
58 type = 'Py_UCS4'
59 converter = 'convert_uc'
60
61 def converter_init(self):
62 if self.default is not unspecified:
63 self.c_default = ascii(self.default)
64 if len(self.c_default) > 4 or self.c_default[0] != "'":
65 self.c_default = hex(ord(self.default))
66
67[python start generated code]*/
68/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080069
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000070/* --- Globals ------------------------------------------------------------
71
Serhiy Storchaka05997252013-01-26 12:14:02 +020072NOTE: In the interpreter's initialization phase, some globals are currently
73 initialized dynamically as needed. In the process Unicode objects may
74 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000075
76*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000078
79#ifdef __cplusplus
80extern "C" {
81#endif
82
Victor Stinner8faf8212011-12-08 22:14:11 +010083/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
84#define MAX_UNICODE 0x10ffff
85
/* Validate that op is a str object.  Release builds only test the type;
   debug builds run the full structural consistency checker, with the
   content scan disabled (check_content == 0). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020091
/* --- Raw field accessors ------------------------------------------------
   These expand to the struct member itself and perform no checks. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors --------------------------------------------------
   These first assert that op passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 view of a ready string.  For a compact ASCII string the characters
   stored right after the PyASCIIObject header are used and the UTF-8 length
   is the string length; otherwise the utf8/utf8_length fields are read. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* File-local replacement for PyUnicode_READY(): asserts that op passes
   _PyUnicode_CHECK(), evaluates to 0 when the string is already ready and
   otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200133
/* true if the UTF-8 pointer of a non compact-ASCII string is the
   character data pointer itself (the two representations share memory) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer is the character data pointer itself.
   The macro only refers to its own argument; it does not rely on the
   caller having a variable with any particular name in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
155
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain cast to to_type.  The main
   loop handles four elements per iteration; the trailing loop copies the
   remaining 0-3 elements.  The source is only read, so the source pointers
   are cast to "const from_type *" (no qualifier is dropped when the caller
   passes const data).  All locals carry a leading underscore to keep them
   apart from identifiers used at the expansion site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_to = (to_type *)(to);                         \
        const from_type *_iter = (const from_type *)(begin);    \
        const from_type *_end = (const from_type *)(end);       \
        Py_ssize_t _n = _end - _iter;                           \
        const from_type *_unrolled_end =                        \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                 \
        while (_iter < _unrolled_end) {                         \
            _to[0] = (to_type) _iter[0];                        \
            _to[1] = (to_type) _iter[1];                        \
            _to[2] = (to_type) _iter[2];                        \
            _to[3] = (to_type) _iter[3];                        \
            _iter += 4; _to += 4;                               \
        }                                                       \
        while (_iter < _end)                                    \
            *_to++ = (to_type) *_iter++;                        \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200179
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
187
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200196static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200199static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200200
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200202 do { \
203 if (unicode_empty != NULL) \
204 Py_INCREF(unicode_empty); \
205 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206 unicode_empty = PyUnicode_New(0, 0); \
207 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200208 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
210 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213
Serhiy Storchaka678db842013-01-26 12:16:36 +0200214#define _Py_RETURN_UNICODE_EMPTY() \
215 do { \
216 _Py_INCREF_UNICODE_EMPTY(); \
217 return unicode_empty; \
218 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200220/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700221static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200222_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200224/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000227/* Single character Unicode strings in the Latin-1 range are being
228 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200229static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
/* Fast detection of the most frequent whitespace characters.
   The table is indexed by code point (0..127); an entry is 1 for:
     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
261
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200262/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200263static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200264static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100265static int unicode_modifiable(PyObject *unicode);
266
Victor Stinnerfe226c02011-10-03 03:52:20 +0200267
Alexander Belopolsky40018472011-02-26 01:02:56 +0000268static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100269_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200270static PyObject *
271_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272static PyObject *
273_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274
275static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000276unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000277 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100278 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000279 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280
Alexander Belopolsky40018472011-02-26 01:02:56 +0000281static void
282raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300283 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100284 PyObject *unicode,
285 Py_ssize_t startpos, Py_ssize_t endpos,
286 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000287
/* Fast detection of line break characters.
   The table is indexed by code point (0..127); an entry is 1 for:
     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN
   and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
315
INADA Naoki3ae20562017-01-16 20:41:20 +0900316static int convert_uc(PyObject *obj, void *addr);
317
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300318#include "clinic/unicodeobject.c.h"
319
/* Codec error handlers recognised by name.  _Py_ERROR_OTHER stands for any
   name that get_error_handler() does not know. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: handler name, or NULL for the default.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching value for
   the other known names (compared with strcmp, so case-sensitively), and
   _Py_ERROR_OTHER for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
358
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300359/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000362PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000363{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000364#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000365 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000366#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000367 /* This is actually an illegal character, so it should
368 not be passed to unichr. */
369 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000370#endif
371}
372
#ifdef Py_DEBUG
/* Debug-build check of the internal layout of a str object.

   op must be a str instance.  Every invariant is verified with assert();
   when all of them hold the function returns 1, so the call itself can be
   wrapped in assert().

   If check_content is non-zero and the kind is not PyUnicode_WCHAR_KIND,
   the characters are scanned as well to verify that the largest character
   fits the range expected for the kind and the ascii flag, and that the
   element following the last character is 0. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose element size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* not compact and not ready: only the wstr buffer exists */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* not compact, ready */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                /* wstr must alias the character data */
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch) {
                maxchar = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the buffer ends with a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
488
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489static PyObject*
490unicode_result_wchar(PyObject *unicode)
491{
492#ifndef Py_DEBUG
493 Py_ssize_t len;
494
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100495 len = _PyUnicode_WSTR_LENGTH(unicode);
496 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100497 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 }
500
501 if (len == 1) {
502 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100503 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100504 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505 Py_DECREF(unicode);
506 return latin1_char;
507 }
508 }
509
510 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200511 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 return NULL;
513 }
514#else
Victor Stinneraa771272012-10-04 02:32:58 +0200515 assert(Py_REFCNT(unicode) == 1);
516
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100517 /* don't make the result ready in debug mode to ensure that the caller
518 makes the string ready before using it */
519 assert(_PyUnicode_CheckConsistency(unicode, 1));
520#endif
521 return unicode;
522}
523
524static PyObject*
525unicode_result_ready(PyObject *unicode)
526{
527 Py_ssize_t length;
528
529 length = PyUnicode_GET_LENGTH(unicode);
530 if (length == 0) {
531 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100532 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200533 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 }
535 return unicode_empty;
536 }
537
538 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200539 void *data = PyUnicode_DATA(unicode);
540 int kind = PyUnicode_KIND(unicode);
541 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 if (ch < 256) {
543 PyObject *latin1_char = unicode_latin1[ch];
544 if (latin1_char != NULL) {
545 if (unicode != latin1_char) {
546 Py_INCREF(latin1_char);
547 Py_DECREF(unicode);
548 }
549 return latin1_char;
550 }
551 else {
552 assert(_PyUnicode_CheckConsistency(unicode, 1));
553 Py_INCREF(unicode);
554 unicode_latin1[ch] = unicode;
555 return unicode;
556 }
557 }
558 }
559
560 assert(_PyUnicode_CheckConsistency(unicode, 1));
561 return unicode;
562}
563
564static PyObject*
565unicode_result(PyObject *unicode)
566{
567 assert(_PyUnicode_CHECK(unicode));
568 if (PyUnicode_IS_READY(unicode))
569 return unicode_result_ready(unicode);
570 else
571 return unicode_result_wchar(unicode);
572}
573
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574static PyObject*
575unicode_result_unchanged(PyObject *unicode)
576{
577 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500578 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100579 return NULL;
580 Py_INCREF(unicode);
581 return unicode;
582 }
583 else
584 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100585 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100586}
587
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200588/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589 ASCII, Latin1, UTF-8, etc. */
590static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200591backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200592 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593{
Victor Stinnerad771582015-10-09 12:38:53 +0200594 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200595 Py_UCS4 ch;
596 enum PyUnicode_Kind kind;
597 void *data;
598
599 assert(PyUnicode_IS_READY(unicode));
600 kind = PyUnicode_KIND(unicode);
601 data = PyUnicode_DATA(unicode);
602
603 size = 0;
604 /* determine replacement size */
605 for (i = collstart; i < collend; ++i) {
606 Py_ssize_t incr;
607
608 ch = PyUnicode_READ(kind, data, i);
609 if (ch < 0x100)
610 incr = 2+2;
611 else if (ch < 0x10000)
612 incr = 2+4;
613 else {
614 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200615 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200616 }
617 if (size > PY_SSIZE_T_MAX - incr) {
618 PyErr_SetString(PyExc_OverflowError,
619 "encoded result is too long for a Python string");
620 return NULL;
621 }
622 size += incr;
623 }
624
Victor Stinnerad771582015-10-09 12:38:53 +0200625 str = _PyBytesWriter_Prepare(writer, str, size);
626 if (str == NULL)
627 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628
629 /* generate replacement */
630 for (i = collstart; i < collend; ++i) {
631 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200632 *str++ = '\\';
633 if (ch >= 0x00010000) {
634 *str++ = 'U';
635 *str++ = Py_hexdigits[(ch>>28)&0xf];
636 *str++ = Py_hexdigits[(ch>>24)&0xf];
637 *str++ = Py_hexdigits[(ch>>20)&0xf];
638 *str++ = Py_hexdigits[(ch>>16)&0xf];
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200641 }
Victor Stinner797485e2015-10-09 03:17:30 +0200642 else if (ch >= 0x100) {
643 *str++ = 'u';
644 *str++ = Py_hexdigits[(ch>>12)&0xf];
645 *str++ = Py_hexdigits[(ch>>8)&0xf];
646 }
647 else
648 *str++ = 'x';
649 *str++ = Py_hexdigits[(ch>>4)&0xf];
650 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 }
652 return str;
653}
654
655/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656 ASCII, Latin1, UTF-8, etc. */
657static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200658xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200659 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660{
Victor Stinnerad771582015-10-09 12:38:53 +0200661 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 Py_UCS4 ch;
663 enum PyUnicode_Kind kind;
664 void *data;
665
666 assert(PyUnicode_IS_READY(unicode));
667 kind = PyUnicode_KIND(unicode);
668 data = PyUnicode_DATA(unicode);
669
670 size = 0;
671 /* determine replacement size */
672 for (i = collstart; i < collend; ++i) {
673 Py_ssize_t incr;
674
675 ch = PyUnicode_READ(kind, data, i);
676 if (ch < 10)
677 incr = 2+1+1;
678 else if (ch < 100)
679 incr = 2+2+1;
680 else if (ch < 1000)
681 incr = 2+3+1;
682 else if (ch < 10000)
683 incr = 2+4+1;
684 else if (ch < 100000)
685 incr = 2+5+1;
686 else if (ch < 1000000)
687 incr = 2+6+1;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+7+1;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
Victor Stinnerad771582015-10-09 12:38:53 +0200700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707 }
708 return str;
709}
710
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711/* --- Bloom Filters ----------------------------------------------------- */
712
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
716
717/* the linebreak mask is set up by Unicode_Init below */
718
Antoine Pitrouf068f942010-01-13 14:19:12 +0000719#if LONG_BIT >= 128
720#define BLOOM_WIDTH 128
721#elif LONG_BIT >= 64
722#define BLOOM_WIDTH 64
723#elif LONG_BIT >= 32
724#define BLOOM_WIDTH 32
725#else
726#error "LONG_BIT is smaller than 32"
727#endif
728
Thomas Wouters477c8d52006-05-27 19:21:47 +0000729#define BLOOM_MASK unsigned long
730
Serhiy Storchaka05997252013-01-26 12:14:02 +0200731static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
Benjamin Peterson29060642009-01-31 22:14:21 +0000735#define BLOOM_LINEBREAK(ch) \
736 ((ch) < 128U ? ascii_linebreak[(ch)] : \
737 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700739static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741{
Victor Stinnera85af502013-04-09 21:53:54 +0200742#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743 do { \
744 TYPE *data = (TYPE *)PTR; \
745 TYPE *end = data + LEN; \
746 Py_UCS4 ch; \
747 for (; data != end; data++) { \
748 ch = *data; \
749 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750 } \
751 break; \
752 } while (0)
753
Thomas Wouters477c8d52006-05-27 19:21:47 +0000754 /* calculate simple bloom-style bitmask for a given unicode string */
755
Antoine Pitrouf068f942010-01-13 14:19:12 +0000756 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000757
758 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200759 switch (kind) {
760 case PyUnicode_1BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762 break;
763 case PyUnicode_2BYTE_KIND:
764 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765 break;
766 case PyUnicode_4BYTE_KIND:
767 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768 break;
769 default:
770 assert(0);
771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200773
774#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775}
776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300777static int
778ensure_unicode(PyObject *obj)
779{
780 if (!PyUnicode_Check(obj)) {
781 PyErr_Format(PyExc_TypeError,
782 "must be str, not %.100s",
783 Py_TYPE(obj)->tp_name);
784 return -1;
785 }
786 return PyUnicode_READY(obj);
787}
788
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200789/* Compilation of templated routines */
790
791#include "stringlib/asciilib.h"
792#include "stringlib/fastsearch.h"
793#include "stringlib/partition.h"
794#include "stringlib/split.h"
795#include "stringlib/count.h"
796#include "stringlib/find.h"
797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs1lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300807#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs2lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300818#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
823#include "stringlib/ucs4lib.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/partition.h"
826#include "stringlib/split.h"
827#include "stringlib/count.h"
828#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300829#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200830#include "stringlib/find_max_char.h"
831#include "stringlib/localeutil.h"
832#include "stringlib/undef.h"
833
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834#include "stringlib/unicodedefs.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/count.h"
837#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100838#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200839
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840/* --- Unicode Object ----------------------------------------------------- */
841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200843fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700845static inline Py_ssize_t
846findchar(const void *s, int kind,
847 Py_ssize_t size, Py_UCS4 ch,
848 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 switch (kind) {
851 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200852 if ((Py_UCS1) ch != ch)
853 return -1;
854 if (direction > 0)
855 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856 else
857 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200858 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200859 if ((Py_UCS2) ch != ch)
860 return -1;
861 if (direction > 0)
862 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863 else
864 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200866 if (direction > 0)
867 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868 else
869 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200870 default:
871 assert(0);
872 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
Victor Stinnerafffce42012-10-03 23:03:17 +0200876#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000877/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200878 earlier.
879
880 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
881 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
882 invalid character in Unicode 6.0. */
883static void
884unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
885{
886 int kind = PyUnicode_KIND(unicode);
887 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
888 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
889 if (length <= old_length)
890 return;
891 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
892}
893#endif
894
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895static PyObject*
896resize_compact(PyObject *unicode, Py_ssize_t length)
897{
898 Py_ssize_t char_size;
899 Py_ssize_t struct_size;
900 Py_ssize_t new_size;
901 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100902 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200903#ifdef Py_DEBUG
904 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905#endif
906
Victor Stinner79891572012-05-03 13:43:07 +0200907 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100909 assert(PyUnicode_IS_COMPACT(unicode));
910
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200911 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913 struct_size = sizeof(PyASCIIObject);
914 else
915 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200916 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919 PyErr_NoMemory();
920 return NULL;
921 }
922 new_size = (struct_size + (length + 1) * char_size);
923
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925 PyObject_DEL(_PyUnicode_UTF8(unicode));
926 _PyUnicode_UTF8(unicode) = NULL;
927 _PyUnicode_UTF8_LENGTH(unicode) = 0;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 _Py_DEC_REFTOTAL;
930 _Py_ForgetReference(unicode);
931
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300932 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100933 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100934 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 PyErr_NoMemory();
936 return NULL;
937 }
Victor Stinner84def372011-12-11 20:04:56 +0100938 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100944 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200945 _PyUnicode_WSTR_LENGTH(unicode) = length;
946 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100947 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_WSTR(unicode));
949 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100950 if (!PyUnicode_IS_ASCII(unicode))
951 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100952 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200953#ifdef Py_DEBUG
954 unicode_fill_invalid(unicode, old_length);
955#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200958 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 return unicode;
960}
961
Alexander Belopolsky40018472011-02-26 01:02:56 +0000962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200963resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964{
Victor Stinner95663112011-10-04 01:03:50 +0200965 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100966 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000969
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 if (PyUnicode_IS_READY(unicode)) {
971 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200972 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200974#ifdef Py_DEBUG
975 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
978 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200979 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982
983 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984 PyErr_NoMemory();
985 return -1;
986 }
987 new_size = (length + 1) * char_size;
988
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990 {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 data = (PyObject *)PyObject_REALLOC(data, new_size);
997 if (data == NULL) {
998 PyErr_NoMemory();
999 return -1;
1000 }
1001 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 _PyUnicode_WSTR_LENGTH(unicode) = length;
1005 }
1006 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001007 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 _PyUnicode_UTF8_LENGTH(unicode) = length;
1009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 _PyUnicode_LENGTH(unicode) = length;
1011 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001012#ifdef Py_DEBUG
1013 unicode_fill_invalid(unicode, old_length);
1014#endif
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001016 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinner95663112011-10-04 01:03:50 +02001020 assert(_PyUnicode_WSTR(unicode) != NULL);
1021
1022 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001023 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001024 PyErr_NoMemory();
1025 return -1;
1026 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001028 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001030 if (!wstr) {
1031 PyErr_NoMemory();
1032 return -1;
1033 }
1034 _PyUnicode_WSTR(unicode) = wstr;
1035 _PyUnicode_WSTR(unicode)[length] = 0;
1036 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001037 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return 0;
1039}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001073 Ux0000 terminated; some code (e.g. new_identifier)
1074 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1186
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *compact_data = _PyUnicode_COMPACT_DATA(unicode);
    return compact_data;
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into one code point; every other unit (including a lone surrogate)
   is copied through unchanged.

   The caller must have sized `unicode` so that it holds exactly the decoded
   code points, plus room for a terminating null character.  The terminator
   itself is not written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;
        Py_ssize_t consumed;

        /* There must still be room for the code point about to be written. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            consumed = 2;
        }
        else {
            ch = src[0];
            consumed = 1;
        }

        *dst = ch;
        dst++;
        src += consumed;
    }

    /* The output buffer must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
1522 assert(0);
1523 return -1;
1524 }
1525 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001526 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001527 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001528 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001529 Py_ssize_t i;
1530
Victor Stinnera0702ab2011-09-29 14:14:38 +02001531 for (i=0; i < how_many; i++) {
1532 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (ch > to_maxchar)
1534 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001535 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 }
1538 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 return 0;
1540}
1541
Victor Stinnerd3f08822012-05-29 12:57:52 +02001542void
1543_PyUnicode_FastCopyCharacters(
1544 PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546{
1547 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548}
1549
1550Py_ssize_t
1551PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many)
1554{
1555 int err;
1556
1557 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561
Benjamin Petersonbac79492012-01-14 13:34:47 -05001562 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001564 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001567 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001568 PyErr_SetString(PyExc_IndexError, "string index out of range");
1569 return -1;
1570 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001571 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001572 PyErr_SetString(PyExc_IndexError, "string index out of range");
1573 return -1;
1574 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001575 if (how_many < 0) {
1576 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577 return -1;
1578 }
1579 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001582 "Cannot write %zi characters at %zi "
1583 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001584 how_many, to_start, PyUnicode_GET_LENGTH(to));
1585 return -1;
1586 }
1587
1588 if (how_many == 0)
1589 return 0;
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 return -1;
1593
1594 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595 if (err) {
1596 PyErr_Format(PyExc_SystemError,
1597 "Cannot copy %s characters "
1598 "into a string of %s characters",
1599 unicode_kind_name(from),
1600 unicode_kind_name(to));
1601 return -1;
1602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604}
1605
Victor Stinner17222162011-09-28 22:15:37 +02001606/* Find the maximum code point and count the number of surrogate pairs so a
1607 correct string length can be computed before converting a string to UCS4.
1608 This function counts single surrogates as a character and not as a pair.
1609
1610 Return 0 on success, or -1 on error. */
1611static int
1612find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614{
1615 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001616 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617
Victor Stinnerc53be962011-10-02 21:33:54 +02001618 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 *num_surrogates = 0;
1620 *maxchar = 0;
1621
1622 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001624 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625 && (iter+1) < end
1626 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627 {
1628 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629 ++(*num_surrogates);
1630 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001634 {
1635 ch = *iter;
1636 iter++;
1637 }
1638 if (ch > *maxchar) {
1639 *maxchar = ch;
1640 if (*maxchar > MAX_UNICODE) {
1641 PyErr_Format(PyExc_ValueError,
1642 "character U+%x is not in range [U+0000; U+10ffff]",
1643 ch);
1644 return -1;
1645 }
1646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 }
1648 return 0;
1649}
1650
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001651int
1652_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653{
1654 wchar_t *end;
1655 Py_UCS4 maxchar = 0;
1656 Py_ssize_t num_surrogates;
1657#if SIZEOF_WCHAR_T == 2
1658 Py_ssize_t length_wo_surrogates;
1659#endif
1660
Georg Brandl7597add2011-10-05 16:36:47 +02001661 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001662 strings were created using _PyObject_New() and where no canonical
1663 representation (the str field) has been set yet aka strings
1664 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001665 assert(_PyUnicode_CHECK(unicode));
1666 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001668 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001669 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001670 /* Actually, it should neither be interned nor be anything else: */
1671 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001674 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677
1678 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001679 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyErr_NoMemory();
1682 return -1;
1683 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 _PyUnicode_WSTR(unicode), end,
1686 PyUnicode_1BYTE_DATA(unicode));
1687 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001691 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 }
1695 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001696 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
1700 PyObject_FREE(_PyUnicode_WSTR(unicode));
1701 _PyUnicode_WSTR(unicode) = NULL;
1702 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703 }
1704 /* In this case we might have to convert down from 4-byte native
1705 wchar_t to 2-byte unicode. */
1706 else if (maxchar < 65536) {
1707 assert(num_surrogates == 0 &&
1708 "FindMaxCharAndNumSurrogatePairs() messed up");
1709
Victor Stinner506f5922011-09-28 22:34:18 +02001710#if SIZEOF_WCHAR_T == 2
1711 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718#else
1719 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001720 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001721 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001723 PyErr_NoMemory();
1724 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 }
Victor Stinner506f5922011-09-28 22:34:18 +02001726 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727 _PyUnicode_WSTR(unicode), end,
1728 PyUnicode_2BYTE_DATA(unicode));
1729 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001732 _PyUnicode_UTF8(unicode) = NULL;
1733 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 }
1739 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740 else {
1741#if SIZEOF_WCHAR_T == 2
1742 /* in case the native representation is 2-bytes, we need to allocate a
1743 new normalized 4-byte version. */
1744 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001745 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746 PyErr_NoMemory();
1747 return -1;
1748 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001749 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 PyErr_NoMemory();
1752 return -1;
1753 }
1754 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001758 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 PyObject_FREE(_PyUnicode_WSTR(unicode));
1762 _PyUnicode_WSTR(unicode) = NULL;
1763 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764#else
1765 assert(num_surrogates == 0);
1766
Victor Stinnerc3c74152011-10-02 20:39:55 +02001767 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001769 _PyUnicode_UTF8(unicode) = NULL;
1770 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772#endif
1773 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774 }
1775 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001776 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 return 0;
1778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001781unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald16807132007-05-25 13:52:07 +00001783 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 case SSTATE_NOT_INTERNED:
1785 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001786
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 case SSTATE_INTERNED_MORTAL:
1788 /* revive dead object temporarily for DelItem */
1789 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001790 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 Py_FatalError(
1792 "deletion of interned string failed");
1793 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001794
Benjamin Peterson29060642009-01-31 22:14:21 +00001795 case SSTATE_INTERNED_IMMORTAL:
1796 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001797
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 default:
1799 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001800 }
1801
Victor Stinner03490912011-10-03 23:45:12 +02001802 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001804 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001806 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001809 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character Latin-1 string from
   unicode_latin1[]), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of length 1 with a canonical kind can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1828
Alexander Belopolsky40018472011-02-26 01:02:56 +00001829static int
Victor Stinner488fa492011-12-12 00:01:39 +01001830unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831{
Victor Stinner488fa492011-12-12 00:01:39 +01001832 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 if (Py_REFCNT(unicode) != 1)
1834 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001835 if (_PyUnicode_HASH(unicode) != -1)
1836 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837 if (PyUnicode_CHECK_INTERNED(unicode))
1838 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001839 if (!PyUnicode_CheckExact(unicode))
1840 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001841#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001842 /* singleton refcount is greater than 1 */
1843 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001844#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001845 return 1;
1846}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848static int
1849unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850{
1851 PyObject *unicode;
1852 Py_ssize_t old_length;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856
1857 assert(unicode != NULL);
1858 assert(PyUnicode_Check(unicode));
1859 assert(0 <= length);
1860
Victor Stinner910337b2011-10-03 03:20:16 +02001861 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 old_length = PyUnicode_WSTR_LENGTH(unicode);
1863 else
1864 old_length = PyUnicode_GET_LENGTH(unicode);
1865 if (old_length == length)
1866 return 0;
1867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001869 _Py_INCREF_UNICODE_EMPTY();
1870 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001872 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001873 return 0;
1874 }
1875
Victor Stinner488fa492011-12-12 00:01:39 +01001876 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *copy = resize_copy(unicode, length);
1878 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001880 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882 }
1883
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001885 PyObject *new_unicode = resize_compact(unicode, length);
1886 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001888 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001891 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001892}
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001896{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897 PyObject *unicode;
1898 if (p_unicode == NULL) {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001903 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001904 {
1905 PyErr_BadInternalCall();
1906 return -1;
1907 }
1908 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001911/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001913 WARNING: The function doesn't copy the terminating null character and
1914 doesn't check the maximum character (may write a latin1 character in an
1915 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001916static void
1917unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001919{
1920 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923
1924 switch (kind) {
1925 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001927#ifdef Py_DEBUG
1928 if (PyUnicode_IS_ASCII(unicode)) {
1929 Py_UCS4 maxchar = ucs1lib_find_max_char(
1930 (const Py_UCS1*)str,
1931 (const Py_UCS1*)str + len);
1932 assert(maxchar < 128);
1933 }
1934#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001935 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 case PyUnicode_2BYTE_KIND: {
1939 Py_UCS2 *start = (Py_UCS2 *)data + index;
1940 Py_UCS2 *ucs2 = start;
1941 assert(index <= PyUnicode_GET_LENGTH(unicode));
1942
Victor Stinner184252a2012-06-16 02:57:41 +02001943 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 *ucs2 = (Py_UCS2)*str;
1945
1946 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001947 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 default: {
1950 Py_UCS4 *start = (Py_UCS4 *)data + index;
1951 Py_UCS4 *ucs4 = start;
1952 assert(kind == PyUnicode_4BYTE_KIND);
1953 assert(index <= PyUnicode_GET_LENGTH(unicode));
1954
Victor Stinner184252a2012-06-16 02:57:41 +02001955 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001956 *ucs4 = (Py_UCS4)*str;
1957
1958 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001959 }
1960 }
1961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963static PyObject*
1964get_latin1_char(unsigned char ch)
1965{
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode)
1970 return NULL;
1971 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 unicode_latin1[ch] = unicode;
1974 }
1975 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001976 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977}
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979static PyObject*
1980unicode_char(Py_UCS4 ch)
1981{
1982 PyObject *unicode;
1983
1984 assert(ch <= MAX_UNICODE);
1985
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001986 if (ch < 256)
1987 return get_latin1_char(ch);
1988
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 unicode = PyUnicode_New(1, ch);
1990 if (unicode == NULL)
1991 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001992
1993 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001996 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999 }
2000 assert(_PyUnicode_CheckConsistency(unicode, 1));
2001 return unicode;
2002}
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004PyObject *
2005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002007 if (u == NULL)
2008 return (PyObject*)_PyUnicode_New(size);
2009
2010 if (size < 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 return PyUnicode_FromWideChar(u, size);
2016}
2017
2018PyObject *
2019PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 Py_UCS4 maxchar = 0;
2023 Py_ssize_t num_surrogates;
2024
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL && size != 0) {
2026 PyErr_BadInternalCall();
2027 return NULL;
2028 }
2029
2030 if (size == -1) {
2031 size = wcslen(u);
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 /* If the Unicode data is known at construction time, we can apply
2035 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002038 if (size == 0)
2039 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 /* Single character Unicode objects in the Latin-1 range are
2042 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002043 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return get_latin1_char((unsigned char)*u);
2045
2046 /* If not empty and not single character, copy the Unicode data
2047 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002048 if (find_maxchar_surrogates(u, u + size,
2049 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 return NULL;
2051
Victor Stinner8faf8212011-12-08 22:14:11 +01002052 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!unicode)
2054 return NULL;
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 switch (PyUnicode_KIND(unicode)) {
2057 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060 break;
2061 case PyUnicode_2BYTE_KIND:
2062#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002063 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002065 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067#endif
2068 break;
2069 case PyUnicode_4BYTE_KIND:
2070#if SIZEOF_WCHAR_T == 2
2071 /* This is the only case which has to process surrogates, thus
2072 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002073 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#else
2075 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002076 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077#endif
2078 break;
2079 default:
2080 assert(0 && "Impossible state");
2081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002083 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (size < 0) {
2090 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 return NULL;
2093 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002094 if (u != NULL)
2095 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096 else
2097 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002098}
2099
Alexander Belopolsky40018472011-02-26 01:02:56 +00002100PyObject *
2101PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102{
2103 size_t size = strlen(u);
2104 if (size > PY_SSIZE_T_MAX) {
2105 PyErr_SetString(PyExc_OverflowError, "input too long");
2106 return NULL;
2107 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002108 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002109}
2110
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002111PyObject *
2112_PyUnicode_FromId(_Py_Identifier *id)
2113{
2114 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002115 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116 strlen(id->string),
2117 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002118 if (!id->object)
2119 return NULL;
2120 PyUnicode_InternInPlace(&id->object);
2121 assert(!id->next);
2122 id->next = static_strings;
2123 static_strings = id;
2124 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002125 return id->object;
2126}
2127
2128void
2129_PyUnicode_ClearStaticStrings()
2130{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002131 _Py_Identifier *tmp, *s = static_strings;
2132 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002133 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002134 tmp = s->next;
2135 s->next = NULL;
2136 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002138 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139}
2140
Benjamin Peterson0df54292012-03-26 14:50:32 -04002141/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Victor Stinnerd3f08822012-05-29 12:57:52 +02002143PyObject*
2144_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002145{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002146 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002148 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002149#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002152 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002153 }
Victor Stinner785938e2011-12-11 20:09:03 +01002154 unicode = PyUnicode_New(size, 127);
2155 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002156 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002157 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002160}
2161
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002162static Py_UCS4
2163kind_maxchar_limit(unsigned int kind)
2164{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002165 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002166 case PyUnicode_1BYTE_KIND:
2167 return 0x80;
2168 case PyUnicode_2BYTE_KIND:
2169 return 0x100;
2170 case PyUnicode_4BYTE_KIND:
2171 return 0x10000;
2172 default:
2173 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002175 }
2176}
2177
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002178static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002179align_maxchar(Py_UCS4 maxchar)
2180{
2181 if (maxchar <= 127)
2182 return 127;
2183 else if (maxchar <= 255)
2184 return 255;
2185 else if (maxchar <= 65535)
2186 return 65535;
2187 else
2188 return MAX_UNICODE;
2189}
2190
Victor Stinner702c7342011-10-05 13:50:52 +02002191static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002192_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002195 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196
Serhiy Storchaka678db842013-01-26 12:16:36 +02002197 if (size == 0)
2198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002199 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002200 if (size == 1)
2201 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002202
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002203 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002204 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 if (!res)
2206 return NULL;
2207 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002208 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002210}
2211
Victor Stinnere57b1c02011-09-28 22:20:48 +02002212static PyObject*
2213_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214{
2215 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002216 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002217
Serhiy Storchaka678db842013-01-26 12:16:36 +02002218 if (size == 0)
2219 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002220 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002221 if (size == 1)
2222 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002223
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002224 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002225 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 if (!res)
2227 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002230 else {
2231 _PyUnicode_CONVERT_BYTES(
2232 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002234 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 return res;
2236}
2237
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238static PyObject*
2239_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240{
2241 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002242 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243
Serhiy Storchaka678db842013-01-26 12:16:36 +02002244 if (size == 0)
2245 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002247 if (size == 1)
2248 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002250 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 if (!res)
2253 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002254 if (max_char < 256)
2255 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256 PyUnicode_1BYTE_DATA(res));
2257 else if (max_char < 0x10000)
2258 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259 PyUnicode_2BYTE_DATA(res));
2260 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002262 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 return res;
2264}
2265
2266PyObject*
2267PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002269 if (size < 0) {
2270 PyErr_SetString(PyExc_ValueError, "size must be positive");
2271 return NULL;
2272 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002273 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002275 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002277 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002281 PyErr_SetString(PyExc_SystemError, "invalid kind");
2282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284}
2285
Victor Stinnerece58de2012-04-23 23:36:38 +02002286Py_UCS4
2287_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288{
2289 enum PyUnicode_Kind kind;
2290 void *startptr, *endptr;
2291
2292 assert(PyUnicode_IS_READY(unicode));
2293 assert(0 <= start);
2294 assert(end <= PyUnicode_GET_LENGTH(unicode));
2295 assert(start <= end);
2296
2297 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2298 return PyUnicode_MAX_CHAR_VALUE(unicode);
2299
2300 if (start == end)
2301 return 127;
2302
Victor Stinner94d558b2012-04-27 22:26:58 +02002303 if (PyUnicode_IS_ASCII(unicode))
2304 return 127;
2305
Victor Stinnerece58de2012-04-23 23:36:38 +02002306 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002307 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002308 endptr = (char *)startptr + end * kind;
2309 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002310 switch(kind) {
2311 case PyUnicode_1BYTE_KIND:
2312 return ucs1lib_find_max_char(startptr, endptr);
2313 case PyUnicode_2BYTE_KIND:
2314 return ucs2lib_find_max_char(startptr, endptr);
2315 case PyUnicode_4BYTE_KIND:
2316 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002318 assert(0);
2319 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002320 }
2321}
2322
Victor Stinner25a4b292011-10-06 12:31:55 +02002323/* Ensure that a string uses the most efficient storage, if it is not the
2324 case: create a new string with of the right kind. Write NULL into *p_unicode
2325 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002326static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002327unicode_adjust_maxchar(PyObject **p_unicode)
2328{
2329 PyObject *unicode, *copy;
2330 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002331 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 unsigned int kind;
2333
2334 assert(p_unicode != NULL);
2335 unicode = *p_unicode;
2336 assert(PyUnicode_IS_READY(unicode));
2337 if (PyUnicode_IS_ASCII(unicode))
2338 return;
2339
2340 len = PyUnicode_GET_LENGTH(unicode);
2341 kind = PyUnicode_KIND(unicode);
2342 if (kind == PyUnicode_1BYTE_KIND) {
2343 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002344 max_char = ucs1lib_find_max_char(u, u + len);
2345 if (max_char >= 128)
2346 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002347 }
2348 else if (kind == PyUnicode_2BYTE_KIND) {
2349 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002350 max_char = ucs2lib_find_max_char(u, u + len);
2351 if (max_char >= 256)
2352 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002353 }
2354 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002356 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 max_char = ucs4lib_find_max_char(u, u + len);
2358 if (max_char >= 0x10000)
2359 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002360 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002361 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002362 if (copy != NULL)
2363 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 Py_DECREF(unicode);
2365 *p_unicode = copy;
2366}
2367
Victor Stinner034f6cf2011-09-30 02:26:44 +02002368PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002369_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370{
Victor Stinner87af4f22011-11-21 23:03:47 +01002371 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002372 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002373
Victor Stinner034f6cf2011-09-30 02:26:44 +02002374 if (!PyUnicode_Check(unicode)) {
2375 PyErr_BadInternalCall();
2376 return NULL;
2377 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002378 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002380
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 length = PyUnicode_GET_LENGTH(unicode);
2382 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 if (!copy)
2384 return NULL;
2385 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2386
Christian Heimesf051e432016-09-13 20:22:02 +02002387 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002388 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002389 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002391}
2392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393
Victor Stinnerbc603d12011-10-02 01:00:40 +02002394/* Widen Unicode objects to larger buffers. Don't write terminating null
2395 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396
2397void*
2398_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2399{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 Py_ssize_t len;
2401 void *result;
2402 unsigned int skind;
2403
Benjamin Petersonbac79492012-01-14 13:34:47 -05002404 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405 return NULL;
2406
2407 len = PyUnicode_GET_LENGTH(s);
2408 skind = PyUnicode_KIND(s);
2409 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002410 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 return NULL;
2412 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002413 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002415 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 if (!result)
2417 return PyErr_NoMemory();
2418 assert(skind == PyUnicode_1BYTE_KIND);
2419 _PyUnicode_CONVERT_BYTES(
2420 Py_UCS1, Py_UCS2,
2421 PyUnicode_1BYTE_DATA(s),
2422 PyUnicode_1BYTE_DATA(s) + len,
2423 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 if (skind == PyUnicode_2BYTE_KIND) {
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS2, Py_UCS4,
2432 PyUnicode_2BYTE_DATA(s),
2433 PyUnicode_2BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 else {
2437 assert(skind == PyUnicode_1BYTE_KIND);
2438 _PyUnicode_CONVERT_BYTES(
2439 Py_UCS1, Py_UCS4,
2440 PyUnicode_1BYTE_DATA(s),
2441 PyUnicode_1BYTE_DATA(s) + len,
2442 result);
2443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002445 default:
2446 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 }
Victor Stinner01698042011-10-04 00:04:26 +02002448 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 return NULL;
2450}
2451
2452static Py_UCS4*
2453as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2454 int copy_null)
2455{
2456 int kind;
2457 void *data;
2458 Py_ssize_t len, targetlen;
2459 if (PyUnicode_READY(string) == -1)
2460 return NULL;
2461 kind = PyUnicode_KIND(string);
2462 data = PyUnicode_DATA(string);
2463 len = PyUnicode_GET_LENGTH(string);
2464 targetlen = len;
2465 if (copy_null)
2466 targetlen++;
2467 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002468 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!target) {
2470 PyErr_NoMemory();
2471 return NULL;
2472 }
2473 }
2474 else {
2475 if (targetsize < targetlen) {
2476 PyErr_Format(PyExc_SystemError,
2477 "string is longer than the buffer");
2478 if (copy_null && 0 < targetsize)
2479 target[0] = 0;
2480 return NULL;
2481 }
2482 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002483 if (kind == PyUnicode_1BYTE_KIND) {
2484 Py_UCS1 *start = (Py_UCS1 *) data;
2485 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002487 else if (kind == PyUnicode_2BYTE_KIND) {
2488 Py_UCS2 *start = (Py_UCS2 *) data;
2489 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2490 }
2491 else {
2492 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002493 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 if (copy_null)
2496 target[len] = 0;
2497 return target;
2498}
2499
2500Py_UCS4*
2501PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002504 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 PyErr_BadInternalCall();
2506 return NULL;
2507 }
2508 return as_ucs4(string, target, targetsize, copy_null);
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4Copy(PyObject *string)
2513{
2514 return as_ucs4(string, NULL, 0, 1);
2515}
2516
/* Maximum number of characters required for the output of %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits,
   plus 1 for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002521
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002522static int
2523unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2524 Py_ssize_t width, Py_ssize_t precision)
2525{
2526 Py_ssize_t length, fill, arglen;
2527 Py_UCS4 maxchar;
2528
2529 if (PyUnicode_READY(str) == -1)
2530 return -1;
2531
2532 length = PyUnicode_GET_LENGTH(str);
2533 if ((precision == -1 || precision >= length)
2534 && width <= length)
2535 return _PyUnicodeWriter_WriteStr(writer, str);
2536
2537 if (precision != -1)
2538 length = Py_MIN(precision, length);
2539
2540 arglen = Py_MAX(length, width);
2541 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2542 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2543 else
2544 maxchar = writer->maxchar;
2545
2546 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2547 return -1;
2548
2549 if (width > length) {
2550 fill = width - length;
2551 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2552 return -1;
2553 writer->pos += fill;
2554 }
2555
2556 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2557 str, 0, length);
2558 writer->pos += length;
2559 return 0;
2560}
2561
2562static int
2563unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2564 Py_ssize_t width, Py_ssize_t precision)
2565{
2566 /* UTF-8 */
2567 Py_ssize_t length;
2568 PyObject *unicode;
2569 int res;
2570
2571 length = strlen(str);
2572 if (precision != -1)
2573 length = Py_MIN(length, precision);
2574 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2575 if (unicode == NULL)
2576 return -1;
2577
2578 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2579 Py_DECREF(unicode);
2580 return res;
2581}
2582
Victor Stinner96865452011-03-01 23:44:09 +00002583static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002584unicode_fromformat_arg(_PyUnicodeWriter *writer,
2585 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002586{
Victor Stinnere215d962012-10-06 23:03:36 +02002587 const char *p;
2588 Py_ssize_t len;
2589 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 Py_ssize_t width;
2591 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002592 int longflag;
2593 int longlongflag;
2594 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002596
2597 p = f;
2598 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002599 zeropad = 0;
2600 if (*f == '0') {
2601 zeropad = 1;
2602 f++;
2603 }
Victor Stinner96865452011-03-01 23:44:09 +00002604
2605 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 width = -1;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002609 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002610 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002612 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002614 return NULL;
2615 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002617 f++;
2618 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 }
2620 precision = -1;
2621 if (*f == '.') {
2622 f++;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 precision = (*f - '0');
2625 f++;
2626 while (Py_ISDIGIT((unsigned)*f)) {
2627 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2628 PyErr_SetString(PyExc_ValueError,
2629 "precision too big");
2630 return NULL;
2631 }
2632 precision = (precision * 10) + (*f - '0');
2633 f++;
2634 }
2635 }
Victor Stinner96865452011-03-01 23:44:09 +00002636 if (*f == '%') {
2637 /* "%.3%s" => f points to "3" */
2638 f--;
2639 }
2640 }
2641 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002642 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002643 f--;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645
2646 /* Handle %ld, %lu, %lld and %llu. */
2647 longflag = 0;
2648 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002649 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002650 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002652 longflag = 1;
2653 ++f;
2654 }
Victor Stinner96865452011-03-01 23:44:09 +00002655 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002656 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002657 longlongflag = 1;
2658 f += 2;
2659 }
Victor Stinner96865452011-03-01 23:44:09 +00002660 }
2661 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002663 size_tflag = 1;
2664 ++f;
2665 }
Victor Stinnere215d962012-10-06 23:03:36 +02002666
2667 if (f[1] == '\0')
2668 writer->overallocate = 0;
2669
2670 switch (*f) {
2671 case 'c':
2672 {
2673 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002674 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002675 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002676 "character argument not in range(0x110000)");
2677 return NULL;
2678 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002679 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002680 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002681 break;
2682 }
2683
2684 case 'i':
2685 case 'd':
2686 case 'u':
2687 case 'x':
2688 {
2689 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002690 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002692
2693 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002694 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002695 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002697 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002698 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002699 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002700 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, size_t));
2703 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, unsigned int));
2706 }
2707 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002709 }
2710 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002714 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002715 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002716 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002717 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002719 va_arg(*vargs, Py_ssize_t));
2720 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002721 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_arg(*vargs, int));
2723 }
2724 assert(len >= 0);
2725
Victor Stinnere215d962012-10-06 23:03:36 +02002726 if (precision < len)
2727 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728
2729 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2731 return NULL;
2732
Victor Stinnere215d962012-10-06 23:03:36 +02002733 if (width > precision) {
2734 Py_UCS4 fillchar;
2735 fill = width - precision;
2736 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002737 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2738 return NULL;
2739 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 }
Victor Stinner15a11362012-10-06 23:48:20 +02002741 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002743 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2744 return NULL;
2745 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002746 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747
Victor Stinner4a587072013-11-19 12:54:53 +01002748 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2749 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002750 break;
2751 }
2752
2753 case 'p':
2754 {
2755 char number[MAX_LONG_LONG_CHARS];
2756
2757 len = sprintf(number, "%p", va_arg(*vargs, void*));
2758 assert(len >= 0);
2759
2760 /* %p is ill-defined: ensure leading 0x. */
2761 if (number[1] == 'X')
2762 number[1] = 'x';
2763 else if (number[1] != 'x') {
2764 memmove(number + 2, number,
2765 strlen(number) + 1);
2766 number[0] = '0';
2767 number[1] = 'x';
2768 len += 2;
2769 }
2770
Victor Stinner4a587072013-11-19 12:54:53 +01002771 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002772 return NULL;
2773 break;
2774 }
2775
2776 case 's':
2777 {
2778 /* UTF-8 */
2779 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002782 break;
2783 }
2784
2785 case 'U':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 assert(obj && _PyUnicode_CHECK(obj));
2789
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 break;
2793 }
2794
2795 case 'V':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002799 if (obj) {
2800 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 }
2804 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 assert(str != NULL);
2806 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002808 }
2809 break;
2810 }
2811
2812 case 'S':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *str;
2816 assert(obj);
2817 str = PyObject_Str(obj);
2818 if (!str)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(str);
2822 return NULL;
2823 }
2824 Py_DECREF(str);
2825 break;
2826 }
2827
2828 case 'R':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *repr;
2832 assert(obj);
2833 repr = PyObject_Repr(obj);
2834 if (!repr)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(repr);
2838 return NULL;
2839 }
2840 Py_DECREF(repr);
2841 break;
2842 }
2843
2844 case 'A':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *ascii;
2848 assert(obj);
2849 ascii = PyObject_ASCII(obj);
2850 if (!ascii)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(ascii);
2854 return NULL;
2855 }
2856 Py_DECREF(ascii);
2857 break;
2858 }
2859
2860 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002861 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864
2865 default:
2866 /* if we stumble upon an unknown formatting code, copy the rest
2867 of the format string to the output string. (we cannot just
2868 skip the code, since there's no way to know what's in the
2869 argument list) */
2870 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002871 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002872 return NULL;
2873 f = p+len;
2874 return f;
2875 }
2876
2877 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002878 return f;
2879}
2880
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881PyObject *
2882PyUnicode_FromFormatV(const char *format, va_list vargs)
2883{
Victor Stinnere215d962012-10-06 23:03:36 +02002884 va_list vargs2;
2885 const char *f;
2886 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002887
Victor Stinner8f674cc2013-04-17 23:02:17 +02002888 _PyUnicodeWriter_Init(&writer);
2889 writer.min_length = strlen(format) + 100;
2890 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002891
Benjamin Peterson0c212142016-09-20 20:39:33 -07002892 // Copy varags to be able to pass a reference to a subfunction.
2893 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002894
2895 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002897 f = unicode_fromformat_arg(&writer, f, &vargs2);
2898 if (f == NULL)
2899 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002902 const char *p;
2903 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904
Victor Stinnere215d962012-10-06 23:03:36 +02002905 p = f;
2906 do
2907 {
2908 if ((unsigned char)*p > 127) {
2909 PyErr_Format(PyExc_ValueError,
2910 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2911 "string, got a non-ASCII byte: 0x%02x",
2912 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 p++;
2916 }
2917 while (*p != '\0' && *p != '%');
2918 len = p - f;
2919
2920 if (*p == '\0')
2921 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002922
2923 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925
2926 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002929 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002930 return _PyUnicodeWriter_Finish(&writer);
2931
2932 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002933 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002934 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936}
2937
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938PyObject *
2939PyUnicode_FromFormat(const char *format, ...)
2940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002941 PyObject* ret;
2942 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002943
2944#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 ret = PyUnicode_FromFormatV(format, vargs);
2950 va_end(vargs);
2951 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954#ifdef HAVE_WCHAR_H
2955
Victor Stinner5593d8a2010-10-02 11:11:27 +00002956/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2957 convert a Unicode object to a wide character string.
2958
Victor Stinnerd88d9832011-09-06 02:00:05 +02002959 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 character) required to convert the unicode object. Ignore size argument.
2961
Victor Stinnerd88d9832011-09-06 02:00:05 +02002962 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002963 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002964 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002965static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002966unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002967 wchar_t *w,
2968 Py_ssize_t size)
2969{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002970 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 const wchar_t *wstr;
2972
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002973 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002974 if (wstr == NULL)
2975 return -1;
2976
Victor Stinner5593d8a2010-10-02 11:11:27 +00002977 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002978 if (size > res)
2979 size = res + 1;
2980 else
2981 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002982 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002983 return res;
2984 }
2985 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002986 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002987}
2988
2989Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002990PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002991 wchar_t *w,
2992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993{
2994 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 PyErr_BadInternalCall();
2996 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002998 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999}
3000
Victor Stinner137c34c2010-09-29 10:25:54 +00003001wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003002PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003003 Py_ssize_t *size)
3004{
3005 wchar_t* buffer;
3006 Py_ssize_t buflen;
3007
3008 if (unicode == NULL) {
3009 PyErr_BadInternalCall();
3010 return NULL;
3011 }
3012
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003013 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 if (buflen == -1)
3015 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003016 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 if (buffer == NULL) {
3018 PyErr_NoMemory();
3019 return NULL;
3020 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003021 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003022 if (buflen == -1) {
3023 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003024 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003025 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003026 if (size != NULL)
3027 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003028 return buffer;
3029}
3030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003031#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Alexander Belopolsky40018472011-02-26 01:02:56 +00003033PyObject *
3034PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003035{
Victor Stinner8faf8212011-12-08 22:14:11 +01003036 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 PyErr_SetString(PyExc_ValueError,
3038 "chr() arg not in range(0x110000)");
3039 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003040 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003041
Victor Stinner985a82a2014-01-03 12:53:47 +01003042 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003046PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003048 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003050 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003051 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003052 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 Py_INCREF(obj);
3054 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003055 }
3056 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 /* For a Unicode subtype that's not a Unicode object,
3058 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003059 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003060 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003061 PyErr_Format(PyExc_TypeError,
3062 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003063 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003064 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003065}
3066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003068PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 const char *encoding,
3070 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003071{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003072 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003073 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003074
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 PyErr_BadInternalCall();
3077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003079
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003080 /* Decoding bytes objects is the most common case and should be fast */
3081 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003082 if (PyBytes_GET_SIZE(obj) == 0)
3083 _Py_RETURN_UNICODE_EMPTY();
3084 v = PyUnicode_Decode(
3085 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3086 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003087 return v;
3088 }
3089
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003090 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 PyErr_SetString(PyExc_TypeError,
3092 "decoding str is not supported");
3093 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003094 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003095
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003096 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3097 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3098 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003099 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003100 Py_TYPE(obj)->tp_name);
3101 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003102 }
Tim Petersced69f82003-09-16 20:30:58 +00003103
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003104 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003105 PyBuffer_Release(&buffer);
3106 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003108
Serhiy Storchaka05997252013-01-26 12:14:02 +02003109 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003110 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003111 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112}
3113
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into lower; every
   run of other characters that sits between two copied characters collapses
   to a single '_'.  Leading and trailing runs are dropped.  The result is
   always null-terminated on success.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so there is no room even for the null terminator). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* Without this guard, lower_len - 1 would wrap around and the bounds
       check below could never trigger. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    /* Set while a run of separator characters is pending. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the pending separator run, unless nothing
               has been written yet. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003165 Py_ssize_t size,
3166 const char *encoding,
3167 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003168{
3169 PyObject *buffer = NULL, *unicode;
3170 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003171 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3172
3173 if (encoding == NULL) {
3174 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3175 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003176
Fred Drakee4315f52000-05-09 19:53:39 +00003177 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003178 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3179 char *lower = buflower;
3180
3181 /* Fast paths */
3182 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3183 lower += 3;
3184 if (*lower == '_') {
3185 /* Match "utf8" and "utf_8" */
3186 lower++;
3187 }
3188
3189 if (lower[0] == '8' && lower[1] == 0) {
3190 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3191 }
3192 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3193 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3194 }
3195 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3196 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3197 }
3198 }
3199 else {
3200 if (strcmp(lower, "ascii") == 0
3201 || strcmp(lower, "us_ascii") == 0) {
3202 return PyUnicode_DecodeASCII(s, size, errors);
3203 }
Steve Dowercc16be82016-09-08 10:35:16 -07003204 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003205 else if (strcmp(lower, "mbcs") == 0) {
3206 return PyUnicode_DecodeMBCS(s, size, errors);
3207 }
3208 #endif
3209 else if (strcmp(lower, "latin1") == 0
3210 || strcmp(lower, "latin_1") == 0
3211 || strcmp(lower, "iso_8859_1") == 0
3212 || strcmp(lower, "iso8859_1") == 0) {
3213 return PyUnicode_DecodeLatin1(s, size, errors);
3214 }
3215 }
Victor Stinner37296e82010-06-10 13:36:23 +00003216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217
3218 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003219 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003220 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003221 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003222 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 if (buffer == NULL)
3224 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003225 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 if (unicode == NULL)
3227 goto onError;
3228 if (!PyUnicode_Check(unicode)) {
3229 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003230 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3231 "use codecs.decode() to decode to arbitrary types",
3232 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003233 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 Py_DECREF(unicode);
3235 goto onError;
3236 }
3237 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003238 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003239
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 Py_XDECREF(buffer);
3242 return NULL;
3243}
3244
Alexander Belopolsky40018472011-02-26 01:02:56 +00003245PyObject *
3246PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003247 const char *encoding,
3248 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250 if (!PyUnicode_Check(unicode)) {
3251 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003252 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003253 }
3254
Serhiy Storchaka00939072016-10-27 21:05:49 +03003255 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3256 "PyUnicode_AsDecodedObject() is deprecated; "
3257 "use PyCodec_Decode() to decode from str", 1) < 0)
3258 return NULL;
3259
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003260 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003262
3263 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003264 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003265}
3266
Alexander Belopolsky40018472011-02-26 01:02:56 +00003267PyObject *
3268PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003269 const char *encoding,
3270 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003271{
3272 PyObject *v;
3273
3274 if (!PyUnicode_Check(unicode)) {
3275 PyErr_BadArgument();
3276 goto onError;
3277 }
3278
Serhiy Storchaka00939072016-10-27 21:05:49 +03003279 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3280 "PyUnicode_AsDecodedUnicode() is deprecated; "
3281 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3282 return NULL;
3283
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003286
3287 /* Decode via the codec registry */
3288 v = PyCodec_Decode(unicode, encoding, errors);
3289 if (v == NULL)
3290 goto onError;
3291 if (!PyUnicode_Check(v)) {
3292 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003293 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3294 "use codecs.decode() to decode to arbitrary types",
3295 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003296 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003297 Py_DECREF(v);
3298 goto onError;
3299 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003300 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003301
Benjamin Peterson29060642009-01-31 22:14:21 +00003302 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003303 return NULL;
3304}
3305
Alexander Belopolsky40018472011-02-26 01:02:56 +00003306PyObject *
3307PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003308 Py_ssize_t size,
3309 const char *encoding,
3310 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311{
3312 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003313
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003314 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3318 Py_DECREF(unicode);
3319 return v;
3320}
3321
Alexander Belopolsky40018472011-02-26 01:02:56 +00003322PyObject *
3323PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003324 const char *encoding,
3325 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003326{
3327 PyObject *v;
3328
3329 if (!PyUnicode_Check(unicode)) {
3330 PyErr_BadArgument();
3331 goto onError;
3332 }
3333
Serhiy Storchaka00939072016-10-27 21:05:49 +03003334 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3335 "PyUnicode_AsEncodedObject() is deprecated; "
3336 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3337 "or PyCodec_Encode() for generic encoding", 1) < 0)
3338 return NULL;
3339
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003340 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003342
3343 /* Encode via the codec registry */
3344 v = PyCodec_Encode(unicode, encoding, errors);
3345 if (v == NULL)
3346 goto onError;
3347 return v;
3348
Benjamin Peterson29060642009-01-31 22:14:21 +00003349 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003350 return NULL;
3351}
3352
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).

   Returns the index of the offending character in wstr, or 0 if every
   character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Single-character, null-terminated scratch string handed to
       wcstombs(); one extra slot when a surrogate pair may be needed. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = cursor[0];
        cursor += 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return unit_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3399
Victor Stinner1b579672011-12-17 05:47:23 +01003400static int
3401locale_error_handler(const char *errors, int *surrogateescape)
3402{
Victor Stinner50149202015-09-22 00:26:54 +02003403 _Py_error_handler error_handler = get_error_handler(errors);
3404 switch (error_handler)
3405 {
3406 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003407 *surrogateescape = 0;
3408 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003409 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003410 *surrogateescape = 1;
3411 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003412 default:
3413 PyErr_Format(PyExc_ValueError,
3414 "only 'strict' and 'surrogateescape' error handlers "
3415 "are supported, not '%s'",
3416 errors);
3417 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003418 }
Victor Stinner1b579672011-12-17 05:47:23 +01003419}
3420
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003422PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003423{
3424 Py_ssize_t wlen, wlen2;
3425 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003426 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003427 PyObject *bytes, *reason, *exc;
3428 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003429 int surrogateescape;
3430
3431 if (locale_error_handler(errors, &surrogateescape) < 0)
3432 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003433
3434 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3435 if (wstr == NULL)
3436 return NULL;
3437
3438 wlen2 = wcslen(wstr);
3439 if (wlen2 != wlen) {
3440 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003441 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003442 return NULL;
3443 }
3444
3445 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003446 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003447 char *str;
3448
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003449 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450 if (str == NULL) {
3451 if (error_pos == (size_t)-1) {
3452 PyErr_NoMemory();
3453 PyMem_Free(wstr);
3454 return NULL;
3455 }
3456 else {
3457 goto encode_error;
3458 }
3459 }
3460 PyMem_Free(wstr);
3461
3462 bytes = PyBytes_FromString(str);
3463 PyMem_Free(str);
3464 }
3465 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003466 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003467 size_t len, len2;
3468
3469 len = wcstombs(NULL, wstr, 0);
3470 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003471 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003472 goto encode_error;
3473 }
3474
3475 bytes = PyBytes_FromStringAndSize(NULL, len);
3476 if (bytes == NULL) {
3477 PyMem_Free(wstr);
3478 return NULL;
3479 }
3480
3481 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3482 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003483 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003484 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003485 goto encode_error;
3486 }
3487 PyMem_Free(wstr);
3488 }
3489 return bytes;
3490
3491encode_error:
3492 errmsg = strerror(errno);
3493 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003494
3495 if (error_pos == (size_t)-1)
3496 error_pos = wcstombs_errorpos(wstr);
3497
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003498 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003499
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003500 wstr = Py_DecodeLocale(errmsg, &errlen);
3501 if (wstr != NULL) {
3502 reason = PyUnicode_FromWideChar(wstr, errlen);
3503 PyMem_RawFree(wstr);
3504 } else {
3505 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003506 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003507
Victor Stinner2f197072011-12-17 07:08:30 +01003508 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003509 reason = PyUnicode_FromString(
3510 "wcstombs() encountered an unencodable "
3511 "wide character");
3512 if (reason == NULL)
3513 return NULL;
3514
3515 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3516 "locale", unicode,
3517 (Py_ssize_t)error_pos,
3518 (Py_ssize_t)(error_pos+1),
3519 reason);
3520 Py_DECREF(reason);
3521 if (exc != NULL) {
3522 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003523 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003524 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003525 return NULL;
3526}
3527
Victor Stinnerad158722010-10-27 00:25:46 +00003528PyObject *
3529PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003530{
Steve Dowercc16be82016-09-08 10:35:16 -07003531#if defined(__APPLE__)
3532 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003533#else
Victor Stinner793b5312011-04-27 00:24:21 +02003534 PyInterpreterState *interp = PyThreadState_GET()->interp;
3535 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3536 cannot use it to encode and decode filenames before it is loaded. Load
3537 the Python codec requires to encode at least its own filename. Use the C
3538 version of the locale codec until the codec registry is initialized and
3539 the Python codec is loaded.
3540
3541 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3542 cannot only rely on it: check also interp->fscodec_initialized for
3543 subinterpreters. */
3544 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003545 return PyUnicode_AsEncodedString(unicode,
3546 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003547 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003548 }
3549 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003550 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003551 }
Victor Stinnerad158722010-10-27 00:25:46 +00003552#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003553}
3554
Alexander Belopolsky40018472011-02-26 01:02:56 +00003555PyObject *
3556PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003557 const char *encoding,
3558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559{
3560 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003561 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003562
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 if (!PyUnicode_Check(unicode)) {
3564 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 }
Fred Drakee4315f52000-05-09 19:53:39 +00003567
Victor Stinner942889a2016-09-05 15:40:10 -07003568 if (encoding == NULL) {
3569 return _PyUnicode_AsUTF8String(unicode, errors);
3570 }
3571
Fred Drakee4315f52000-05-09 19:53:39 +00003572 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003573 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3574 char *lower = buflower;
3575
3576 /* Fast paths */
3577 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3578 lower += 3;
3579 if (*lower == '_') {
3580 /* Match "utf8" and "utf_8" */
3581 lower++;
3582 }
3583
3584 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003585 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003586 }
3587 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3588 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3589 }
3590 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3591 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3592 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003593 }
Victor Stinner942889a2016-09-05 15:40:10 -07003594 else {
3595 if (strcmp(lower, "ascii") == 0
3596 || strcmp(lower, "us_ascii") == 0) {
3597 return _PyUnicode_AsASCIIString(unicode, errors);
3598 }
Steve Dowercc16be82016-09-08 10:35:16 -07003599#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003600 else if (strcmp(lower, "mbcs") == 0) {
3601 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3602 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003603#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003604 else if (strcmp(lower, "latin1") == 0 ||
3605 strcmp(lower, "latin_1") == 0 ||
3606 strcmp(lower, "iso_8859_1") == 0 ||
3607 strcmp(lower, "iso8859_1") == 0) {
3608 return _PyUnicode_AsLatin1String(unicode, errors);
3609 }
3610 }
Victor Stinner37296e82010-06-10 13:36:23 +00003611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612
3613 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003614 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003616 return NULL;
3617
3618 /* The normal path */
3619 if (PyBytes_Check(v))
3620 return v;
3621
3622 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003623 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003624 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003625 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003626
3627 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003628 "encoder %s returned bytearray instead of bytes; "
3629 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003630 encoding);
3631 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003632 Py_DECREF(v);
3633 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003634 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003635
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003636 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3637 Py_DECREF(v);
3638 return b;
3639 }
3640
3641 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003642 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3643 "use codecs.encode() to encode to arbitrary types",
3644 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003645 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003646 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003647 return NULL;
3648}
3649
Alexander Belopolsky40018472011-02-26 01:02:56 +00003650PyObject *
3651PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003652 const char *encoding,
3653 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003654{
3655 PyObject *v;
3656
3657 if (!PyUnicode_Check(unicode)) {
3658 PyErr_BadArgument();
3659 goto onError;
3660 }
3661
Serhiy Storchaka00939072016-10-27 21:05:49 +03003662 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3663 "PyUnicode_AsEncodedUnicode() is deprecated; "
3664 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3665 return NULL;
3666
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003667 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003669
3670 /* Encode via the codec registry */
3671 v = PyCodec_Encode(unicode, encoding, errors);
3672 if (v == NULL)
3673 goto onError;
3674 if (!PyUnicode_Check(v)) {
3675 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003676 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3677 "use codecs.encode() to encode to arbitrary types",
3678 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003679 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003680 Py_DECREF(v);
3681 goto onError;
3682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003684
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 return NULL;
3687}
3688
/* Find the byte offset in `str` (examining at most `len` bytes) of the first
   sequence that mbrtowc() reports as invalid or incomplete.

   Returns that offset, or 0 when no such sequence is found.  Also returns 0
   when HAVE_MBRTOWC is not defined, since the scan cannot be performed. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        consumed = mbrtowc(&wc, cursor, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3719
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003720PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003721PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003722 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003723{
3724 wchar_t smallbuf[256];
3725 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3726 wchar_t *wstr;
3727 size_t wlen, wlen2;
3728 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003729 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003730 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003731 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003732 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003733
3734 if (locale_error_handler(errors, &surrogateescape) < 0)
3735 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003736
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003737 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3738 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003739 return NULL;
3740 }
3741
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003742 if (surrogateescape) {
3743 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003744 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003745 if (wstr == NULL) {
3746 if (wlen == (size_t)-1)
3747 PyErr_NoMemory();
3748 else
3749 PyErr_SetFromErrno(PyExc_OSError);
3750 return NULL;
3751 }
3752
3753 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003754 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755 }
3756 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003757 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003758#ifndef HAVE_BROKEN_MBSTOWCS
3759 wlen = mbstowcs(NULL, str, 0);
3760#else
3761 wlen = len;
3762#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003763 if (wlen == (size_t)-1)
3764 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003765 if (wlen+1 <= smallbuf_len) {
3766 wstr = smallbuf;
3767 }
3768 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003769 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003770 if (!wstr)
3771 return PyErr_NoMemory();
3772 }
3773
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003774 wlen2 = mbstowcs(wstr, str, wlen+1);
3775 if (wlen2 == (size_t)-1) {
3776 if (wstr != smallbuf)
3777 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003778 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003779 }
3780#ifdef HAVE_BROKEN_MBSTOWCS
3781 assert(wlen2 == wlen);
3782#endif
3783 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3784 if (wstr != smallbuf)
3785 PyMem_Free(wstr);
3786 }
3787 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003788
3789decode_error:
3790 errmsg = strerror(errno);
3791 assert(errmsg != NULL);
3792
3793 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003794 wstr = Py_DecodeLocale(errmsg, &errlen);
3795 if (wstr != NULL) {
3796 reason = PyUnicode_FromWideChar(wstr, errlen);
3797 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003798 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003799
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003800 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003801 reason = PyUnicode_FromString(
3802 "mbstowcs() encountered an invalid multibyte sequence");
3803 if (reason == NULL)
3804 return NULL;
3805
3806 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3807 "locale", str, len,
3808 (Py_ssize_t)error_pos,
3809 (Py_ssize_t)(error_pos+1),
3810 reason);
3811 Py_DECREF(reason);
3812 if (exc != NULL) {
3813 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003814 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003815 }
3816 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003817}
3818
3819PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003820PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003821{
3822 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003823 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003824}
3825
3826
3827PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003828PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003829 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003830 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3831}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003832
Christian Heimes5894ba72007-11-04 11:43:14 +00003833PyObject*
3834PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3835{
Steve Dowercc16be82016-09-08 10:35:16 -07003836#if defined(__APPLE__)
3837 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003838#else
Victor Stinner793b5312011-04-27 00:24:21 +02003839 PyInterpreterState *interp = PyThreadState_GET()->interp;
3840 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3841 cannot use it to encode and decode filenames before it is loaded. Load
3842 the Python codec requires to encode at least its own filename. Use the C
3843 version of the locale codec until the codec registry is initialized and
3844 the Python codec is loaded.
3845
3846 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3847 cannot only rely on it: check also interp->fscodec_initialized for
3848 subinterpreters. */
3849 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003850 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003851 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003852 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003853 }
3854 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003855 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003856 }
Victor Stinnerad158722010-10-27 00:25:46 +00003857#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003858}
3859
Martin v. Löwis011e8422009-05-05 04:43:17 +00003860
3861int
3862PyUnicode_FSConverter(PyObject* arg, void* addr)
3863{
Brett Cannonec6ce872016-09-06 15:50:29 -07003864 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003865 PyObject *output = NULL;
3866 Py_ssize_t size;
3867 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003868 if (arg == NULL) {
3869 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003870 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003871 return 1;
3872 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003873 path = PyOS_FSPath(arg);
3874 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003875 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003876 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003877 if (PyBytes_Check(path)) {
3878 output = path;
3879 }
3880 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3881 output = PyUnicode_EncodeFSDefault(path);
3882 Py_DECREF(path);
3883 if (!output) {
3884 return 0;
3885 }
3886 assert(PyBytes_Check(output));
3887 }
3888
Victor Stinner0ea2a462010-04-30 00:22:08 +00003889 size = PyBytes_GET_SIZE(output);
3890 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003891 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003892 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003893 Py_DECREF(output);
3894 return 0;
3895 }
3896 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003897 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003898}
3899
3900
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003901int
3902PyUnicode_FSDecoder(PyObject* arg, void* addr)
3903{
Brett Cannona5711202016-09-06 19:36:01 -07003904 int is_buffer = 0;
3905 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003906 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003907 if (arg == NULL) {
3908 Py_DECREF(*(PyObject**)addr);
3909 return 1;
3910 }
Brett Cannona5711202016-09-06 19:36:01 -07003911
3912 is_buffer = PyObject_CheckBuffer(arg);
3913 if (!is_buffer) {
3914 path = PyOS_FSPath(arg);
3915 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003916 return 0;
3917 }
Brett Cannona5711202016-09-06 19:36:01 -07003918 }
3919 else {
3920 path = arg;
3921 Py_INCREF(arg);
3922 }
3923
3924 if (PyUnicode_Check(path)) {
3925 if (PyUnicode_READY(path) == -1) {
3926 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003927 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003928 }
3929 output = path;
3930 }
3931 else if (PyBytes_Check(path) || is_buffer) {
3932 PyObject *path_bytes = NULL;
3933
3934 if (!PyBytes_Check(path) &&
3935 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3936 "path should be string, bytes, or os.PathLike, not %.200s",
3937 Py_TYPE(arg)->tp_name)) {
3938 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003939 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003940 }
3941 path_bytes = PyBytes_FromObject(path);
3942 Py_DECREF(path);
3943 if (!path_bytes) {
3944 return 0;
3945 }
3946 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3947 PyBytes_GET_SIZE(path_bytes));
3948 Py_DECREF(path_bytes);
3949 if (!output) {
3950 return 0;
3951 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003952 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003953 else {
3954 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003955 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003956 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003957 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003958 return 0;
3959 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003960 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003961 Py_DECREF(output);
3962 return 0;
3963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003965 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003966 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003967 Py_DECREF(output);
3968 return 0;
3969 }
3970 *(PyObject**)addr = output;
3971 return Py_CLEANUP_SUPPORTED;
3972}
3973
3974
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003975const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003977{
Christian Heimesf3863112007-11-22 07:46:41 +00003978 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003980 if (!PyUnicode_Check(unicode)) {
3981 PyErr_BadArgument();
3982 return NULL;
3983 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003984 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003985 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003987 if (PyUnicode_UTF8(unicode) == NULL) {
3988 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003989 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 if (bytes == NULL)
3991 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003992 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3993 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003994 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 Py_DECREF(bytes);
3996 return NULL;
3997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003998 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003999 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004000 PyBytes_AS_STRING(bytes),
4001 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002 Py_DECREF(bytes);
4003 }
4004
4005 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004006 *psize = PyUnicode_UTF8_LENGTH(unicode);
4007 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004008}
4009
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004010const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4014}
4015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016Py_UNICODE *
4017PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 const unsigned char *one_byte;
4020#if SIZEOF_WCHAR_T == 4
4021 const Py_UCS2 *two_bytes;
4022#else
4023 const Py_UCS4 *four_bytes;
4024 const Py_UCS4 *ucs4_end;
4025 Py_ssize_t num_surrogates;
4026#endif
4027 wchar_t *w;
4028 wchar_t *wchar_end;
4029
4030 if (!PyUnicode_Check(unicode)) {
4031 PyErr_BadArgument();
4032 return NULL;
4033 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004034 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004036 assert(_PyUnicode_KIND(unicode) != 0);
4037 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004039 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004041 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4042 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 num_surrogates = 0;
4044
4045 for (; four_bytes < ucs4_end; ++four_bytes) {
4046 if (*four_bytes > 0xFFFF)
4047 ++num_surrogates;
4048 }
4049
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004050 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4051 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4052 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 PyErr_NoMemory();
4054 return NULL;
4055 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004056 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004058 w = _PyUnicode_WSTR(unicode);
4059 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4060 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4062 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004063 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004065 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4066 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 }
4068 else
4069 *w = *four_bytes;
4070
4071 if (w > wchar_end) {
4072 assert(0 && "Miscalculated string end");
4073 }
4074 }
4075 *w = 0;
4076#else
4077 /* sizeof(wchar_t) == 4 */
4078 Py_FatalError("Impossible unicode object state, wstr and str "
4079 "should share memory already.");
4080 return NULL;
4081#endif
4082 }
4083 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004084 if ((size_t)_PyUnicode_LENGTH(unicode) >
4085 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4086 PyErr_NoMemory();
4087 return NULL;
4088 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004089 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4090 (_PyUnicode_LENGTH(unicode) + 1));
4091 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 PyErr_NoMemory();
4093 return NULL;
4094 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004095 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4096 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4097 w = _PyUnicode_WSTR(unicode);
4098 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004100 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4101 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 for (; w < wchar_end; ++one_byte, ++w)
4103 *w = *one_byte;
4104 /* null-terminate the wstr */
4105 *w = 0;
4106 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004107 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004109 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110 for (; w < wchar_end; ++two_bytes, ++w)
4111 *w = *two_bytes;
4112 /* null-terminate the wstr */
4113 *w = 0;
4114#else
4115 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004116 PyObject_FREE(_PyUnicode_WSTR(unicode));
4117 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 Py_FatalError("Impossible unicode object state, wstr "
4119 "and str should share memory already.");
4120 return NULL;
4121#endif
4122 }
4123 else {
4124 assert(0 && "This should never happen.");
4125 }
4126 }
4127 }
4128 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004129 *size = PyUnicode_WSTR_LENGTH(unicode);
4130 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004131}
4132
Alexander Belopolsky40018472011-02-26 01:02:56 +00004133Py_UNICODE *
4134PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137}
4138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139
Alexander Belopolsky40018472011-02-26 01:02:56 +00004140Py_ssize_t
4141PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142{
4143 if (!PyUnicode_Check(unicode)) {
4144 PyErr_BadArgument();
4145 goto onError;
4146 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004147 if (_PyUnicode_WSTR(unicode) == NULL) {
4148 if (PyUnicode_AsUnicode(unicode) == NULL)
4149 goto onError;
4150 }
4151 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 return -1;
4155}
4156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004157Py_ssize_t
4158PyUnicode_GetLength(PyObject *unicode)
4159{
Victor Stinner07621332012-06-16 04:53:46 +02004160 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004161 PyErr_BadArgument();
4162 return -1;
4163 }
Victor Stinner07621332012-06-16 04:53:46 +02004164 if (PyUnicode_READY(unicode) == -1)
4165 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166 return PyUnicode_GET_LENGTH(unicode);
4167}
4168
4169Py_UCS4
4170PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4171{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004172 void *data;
4173 int kind;
4174
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004175 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4176 PyErr_BadArgument();
4177 return (Py_UCS4)-1;
4178 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004179 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004180 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 return (Py_UCS4)-1;
4182 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004183 data = PyUnicode_DATA(unicode);
4184 kind = PyUnicode_KIND(unicode);
4185 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004186}
4187
4188int
4189PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4190{
4191 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004192 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193 return -1;
4194 }
Victor Stinner488fa492011-12-12 00:01:39 +01004195 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004196 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004197 PyErr_SetString(PyExc_IndexError, "string index out of range");
4198 return -1;
4199 }
Victor Stinner488fa492011-12-12 00:01:39 +01004200 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004201 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004202 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4203 PyErr_SetString(PyExc_ValueError, "character out of range");
4204 return -1;
4205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4207 index, ch);
4208 return 0;
4209}
4210
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4216
Victor Stinner554f3f02010-06-16 23:33:54 +00004217/* create or adjust a UnicodeDecodeError */
4218static void
4219make_decode_exception(PyObject **exceptionObject,
4220 const char *encoding,
4221 const char *input, Py_ssize_t length,
4222 Py_ssize_t startpos, Py_ssize_t endpos,
4223 const char *reason)
4224{
4225 if (*exceptionObject == NULL) {
4226 *exceptionObject = PyUnicodeDecodeError_Create(
4227 encoding, input, length, startpos, endpos, reason);
4228 }
4229 else {
4230 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4231 goto onError;
4232 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4233 goto onError;
4234 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4235 goto onError;
4236 }
4237 return;
4238
4239onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004240 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004241}
4242
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error (with an exception set)

   errors / errorHandler: name of the error handler and a cache for the
       looked-up callable; the lookup is only done while *errorHandler is
       NULL.
   encoding / reason: passed on to the UnicodeDecodeError.
   input / inend: start and end of the input buffer; both are re-pointed
       at the bytes object held by the exception, because the callback
       might have replaced it.
   startinpos / endinpos: the offending input range; *endinpos is set to
       the position returned by the callback.
   exceptionObject: the UnicodeDecodeError, created or updated here.
   inptr: set to the input position where decoding must resume.
   output / outpos: the wchar_t-kind string being built and the current
       write position; the string is resized when needed and *outpos is
       advanced by the length of the replacement.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple format; the text after the ';' doubles
       as the TypeError message (see &argparse[3] below). */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy by length, not with wcsncpy(): the replacement may contain
       embedded NUL characters, and wcsncpy() would stop at the first one
       and zero-fill the remainder. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004355
4356static int
4357unicode_decode_call_errorhandler_writer(
4358 const char *errors, PyObject **errorHandler,
4359 const char *encoding, const char *reason,
4360 const char **input, const char **inend, Py_ssize_t *startinpos,
4361 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4362 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4363{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004364 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004365
4366 PyObject *restuple = NULL;
4367 PyObject *repunicode = NULL;
4368 Py_ssize_t insize;
4369 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004370 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 PyObject *inputobj = NULL;
4372
4373 if (*errorHandler == NULL) {
4374 *errorHandler = PyCodec_LookupError(errors);
4375 if (*errorHandler == NULL)
4376 goto onError;
4377 }
4378
4379 make_decode_exception(exceptionObject,
4380 encoding,
4381 *input, *inend - *input,
4382 *startinpos, *endinpos,
4383 reason);
4384 if (*exceptionObject == NULL)
4385 goto onError;
4386
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004387 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004388 if (restuple == NULL)
4389 goto onError;
4390 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004391 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004392 goto onError;
4393 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004394 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004395 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004396
4397 /* Copy back the bytes variables, which might have been modified by the
4398 callback */
4399 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4400 if (!inputobj)
4401 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004402 *input = PyBytes_AS_STRING(inputobj);
4403 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004404 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004405 /* we can DECREF safely, as the exception has another reference,
4406 so the object won't go away. */
4407 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004411 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004412 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415
Victor Stinner170ca6f2013-04-18 00:25:28 +02004416 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004417 if (replen > 1) {
4418 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004419 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004420 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4421 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4422 goto onError;
4423 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004425 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004428 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004431 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004432 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437}
4438
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439/* --- UTF-7 Codec -------------------------------------------------------- */
4440
Antoine Pitrou244651a2009-05-04 18:56:13 +00004441/* See RFC2152 for details. We encode conservatively and decode liberally. */
4442
/* Three small macros implementing the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/').  Each may evaluate its argument more than
   once, so only pass side-effect free expressions. */

/* Non-zero iff c is a character of the base-64 alphabet. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* The 6-bit value of the base-64 character c.  Any c outside the other
   ranges yields 63, the value of '/'. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* The base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    (*("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + ((n) & 0x3f)))
4465
/* DECODE_DIRECT: true when a byte found in a UTF-7 string outside a
 * base-64 section stands for itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4473
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is read-only.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it indexes utf7_category.
 * directO and directWS are parenthesized so that compound expressions
 * (e.g. "a || b") can be passed safely.  c is evaluated several times,
 * so it must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519
Alexander Belopolsky40018472011-02-26 01:02:56 +00004520PyObject *
4521PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004522 Py_ssize_t size,
4523 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004525 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4526}
4527
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  the UTF-7 encoded input bytes.
 * errors:   error handler name, passed on to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: if NULL, the input is treated as complete and an inconsistent
 *           shift sequence still open at the end is reported as an error.
 *           If non-NULL, it receives the number of input bytes consumed;
 *           a shift sequence still open at the end is not consumed and
 *           the output written since it started is dropped.
 *
 * Returns the decoded string, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* startinpos is only assigned when a '+' or an invalid byte is seen;
       it is only read after one of those assignments has happened. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    /* writer position at the start of the current shift sequence */
    Py_ssize_t shiftOutStart;
    /* bit accumulator for base-64 data and the number of valid bits in it */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* pending high surrogate waiting for its low half, or 0 */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered by "goto restart" from the end-of-string error
           handling below, when the error handler moved the read position
           back inside the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is and
                               go on to handle outCh normally */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte decodes as itself; otherwise it is
                   discarded by the reset below. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts, for error reporting and
               for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Only reached via goto: report input[startinpos:endinpos] to the
           error handler, which may update starts, e and s. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have asked to resume before the end */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* stop before the '+' that opened the unfinished sequence */
            *consumed = startinpos;
            /* Output was produced inside the unfinished sequence and the
               writer holds non-ASCII data: build the result from the first
               shiftOutStart characters instead of finishing the writer.
               NOTE(review): presumably this is so that characters being
               dropped do not influence the kind of the result -- confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4726
4727
Alexander Belopolsky40018472011-02-26 01:02:56 +00004728PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004729_PyUnicode_EncodeUTF7(PyObject *str,
4730 int base64SetO,
4731 int base64WhiteSpace,
4732 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004734 int kind;
4735 void *data;
4736 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004737 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004738 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004739 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 unsigned int base64bits = 0;
4741 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742 char * out;
4743 char * start;
4744
Benjamin Petersonbac79492012-01-14 13:34:47 -05004745 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004746 return NULL;
4747 kind = PyUnicode_KIND(str);
4748 data = PyUnicode_DATA(str);
4749 len = PyUnicode_GET_LENGTH(str);
4750
4751 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004754 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004755 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004756 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004757 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758 if (v == NULL)
4759 return NULL;
4760
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004761 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004762 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004763 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 if (inShift) {
4766 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4767 /* shifting out */
4768 if (base64bits) { /* output remaining bits */
4769 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4770 base64buffer = 0;
4771 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 }
4773 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 /* Characters not in the BASE64 set implicitly unshift the sequence
4775 so no '-' is required, except if the character is itself a '-' */
4776 if (IS_BASE64(ch) || ch == '-') {
4777 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004778 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 *out++ = (char) ch;
4780 }
4781 else {
4782 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004783 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004784 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004785 else { /* not in a shift sequence */
4786 if (ch == '+') {
4787 *out++ = '+';
4788 *out++ = '-';
4789 }
4790 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4791 *out++ = (char) ch;
4792 }
4793 else {
4794 *out++ = '+';
4795 inShift = 1;
4796 goto encode_char;
4797 }
4798 }
4799 continue;
4800encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004802 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004803
Antoine Pitrou244651a2009-05-04 18:56:13 +00004804 /* code first surrogate */
4805 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004806 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 while (base64bits >= 6) {
4808 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4809 base64bits -= 6;
4810 }
4811 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004812 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814 base64bits += 16;
4815 base64buffer = (base64buffer << 16) | ch;
4816 while (base64bits >= 6) {
4817 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818 base64bits -= 6;
4819 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004820 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821 if (base64bits)
4822 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4823 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004824 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004825 if (_PyBytes_Resize(&v, out - start) < 0)
4826 return NULL;
4827 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004829PyObject *
4830PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4831 Py_ssize_t size,
4832 int base64SetO,
4833 int base64WhiteSpace,
4834 const char *errors)
4835{
4836 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004837 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004838 if (tmp == NULL)
4839 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004840 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004841 base64WhiteSpace, errors);
4842 Py_DECREF(tmp);
4843 return result;
4844}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004845
Antoine Pitrou244651a2009-05-04 18:56:13 +00004846#undef IS_BASE64
4847#undef FROM_BASE64
4848#undef TO_BASE64
4849#undef DECODE_DIRECT
4850#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004851
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852/* --- UTF-8 Codec -------------------------------------------------------- */
4853
Alexander Belopolsky40018472011-02-26 01:02:56 +00004854PyObject *
4855PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004856 Py_ssize_t size,
4857 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858{
Walter Dörwald69652032004-09-07 20:24:22 +00004859 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860}
4861
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862#include "stringlib/asciilib.h"
4863#include "stringlib/codecs.h"
4864#include "stringlib/undef.h"
4865
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004866#include "stringlib/ucs1lib.h"
4867#include "stringlib/codecs.h"
4868#include "stringlib/undef.h"
4869
4870#include "stringlib/ucs2lib.h"
4871#include "stringlib/codecs.h"
4872#include "stringlib/undef.h"
4873
4874#include "stringlib/ucs4lib.h"
4875#include "stringlib/codecs.h"
4876#include "stringlib/undef.h"
4877
Antoine Pitrouab868312009-01-10 15:40:25 +00004878/* Mask to quickly check whether a C 'long' contains a
4879 non-ASCII, UTF8-encoded char. */
4880#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004881# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004882#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004883# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004884#else
4885# error C 'long' size should be either 4 or 8!
4886#endif
4887
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888static Py_ssize_t
4889ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004892 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004894 /*
4895 * Issue #17237: m68k is a bit different from most architectures in
4896 * that objects do not use "natural alignment" - for example, int and
4897 * long are only aligned at 2-byte boundaries. Therefore the assert()
4898 * won't work; also, tests have shown that skipping the "optimised
4899 * version" will even speed up m68k.
4900 */
4901#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004903 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4904 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 /* Fast path, see in STRINGLIB(utf8_decode) for
4906 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004907 /* Help allocation */
4908 const char *_p = p;
4909 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 while (_p < aligned_end) {
4911 unsigned long value = *(const unsigned long *) _p;
4912 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 *((unsigned long *)q) = value;
4915 _p += SIZEOF_LONG;
4916 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004917 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 p = _p;
4919 while (p < end) {
4920 if ((unsigned char)*p & 0x80)
4921 break;
4922 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004927#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928 while (p < end) {
4929 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4930 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004931 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004932 /* Help allocation */
4933 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934 while (_p < aligned_end) {
4935 unsigned long value = *(unsigned long *) _p;
4936 if (value & ASCII_CHAR_MASK)
4937 break;
4938 _p += SIZEOF_LONG;
4939 }
4940 p = _p;
4941 if (_p == end)
4942 break;
4943 }
4944 if ((unsigned char)*p & 0x80)
4945 break;
4946 ++p;
4947 }
4948 memcpy(dest, start, p - start);
4949 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950}
Antoine Pitrouab868312009-01-10 15:40:25 +00004951
Victor Stinner785938e2011-12-11 20:09:03 +01004952PyObject *
4953PyUnicode_DecodeUTF8Stateful(const char *s,
4954 Py_ssize_t size,
4955 const char *errors,
4956 Py_ssize_t *consumed)
4957{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004958 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004959 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961
4962 Py_ssize_t startinpos;
4963 Py_ssize_t endinpos;
4964 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004965 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004967 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004968
4969 if (size == 0) {
4970 if (consumed)
4971 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004972 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004973 }
4974
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4976 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004977 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 *consumed = 1;
4979 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004980 }
4981
Victor Stinner8f674cc2013-04-17 23:02:17 +02004982 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004983 writer.min_length = size;
4984 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004986
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 writer.pos = ascii_decode(s, end, writer.data);
4988 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 while (s < end) {
4990 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004992
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004994 if (PyUnicode_IS_ASCII(writer.buffer))
4995 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 } else {
5001 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 }
5004
5005 switch (ch) {
5006 case 0:
5007 if (s == end || consumed)
5008 goto End;
5009 errmsg = "unexpected end of data";
5010 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005011 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 break;
5013 case 1:
5014 errmsg = "invalid start byte";
5015 startinpos = s - starts;
5016 endinpos = startinpos + 1;
5017 break;
5018 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005019 case 3:
5020 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 errmsg = "invalid continuation byte";
5022 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005023 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 break;
5025 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005026 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005027 goto onError;
5028 continue;
5029 }
5030
Victor Stinner1d65d912015-10-05 13:43:50 +02005031 if (error_handler == _Py_ERROR_UNKNOWN)
5032 error_handler = get_error_handler(errors);
5033
5034 switch (error_handler) {
5035 case _Py_ERROR_IGNORE:
5036 s += (endinpos - startinpos);
5037 break;
5038
5039 case _Py_ERROR_REPLACE:
5040 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5041 goto onError;
5042 s += (endinpos - startinpos);
5043 break;
5044
5045 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005046 {
5047 Py_ssize_t i;
5048
Victor Stinner1d65d912015-10-05 13:43:50 +02005049 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5050 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005051 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005052 ch = (Py_UCS4)(unsigned char)(starts[i]);
5053 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5054 ch + 0xdc00);
5055 writer.pos++;
5056 }
5057 s += (endinpos - startinpos);
5058 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005059 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005060
5061 default:
5062 if (unicode_decode_call_errorhandler_writer(
5063 errors, &error_handler_obj,
5064 "utf-8", errmsg,
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
5066 &writer))
5067 goto onError;
5068 }
Victor Stinner785938e2011-12-11 20:09:03 +01005069 }
5070
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 if (consumed)
5073 *consumed = s - starts;
5074
Victor Stinner1d65d912015-10-05 13:43:50 +02005075 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005077 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078
5079onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005080 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005082 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005084}
5085
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X and Android.

   Every byte that cannot be decoded is stored as the code unit
   0xDC00 + byte.  When wchar_t is 2 bytes wide, code points above 0xFFFF
   are stored as a surrogate pair.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL if the buffer
   size would overflow or the memory allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: the result never needs more than size code units (plus the
       terminating NUL).  The comparison is written so that "size + 1" is
       never evaluated before it is known to be representable: computing
       it for size == PY_SSIZE_T_MAX would be signed integer overflow. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* not expected from the 4-byte decoder */
            assert(0);
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* the decoder returned 0 with the whole input consumed */
            if (!ch && s == e)
                break;
            /* surrogateescape: escape one byte and resume after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif   /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005142/* Primary internal function which creates utf8 encoded bytes objects.
5143
5144 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005145 and allocate exactly as much space needed at the end. Else allocate the
5146 maximum possible needed (4 result bytes per Unicode character), and return
5147 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005148*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005149PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005150_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151{
Victor Stinner6099a032011-12-18 14:22:26 +01005152 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153 void *data;
5154 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005156 if (!PyUnicode_Check(unicode)) {
5157 PyErr_BadArgument();
5158 return NULL;
5159 }
5160
5161 if (PyUnicode_READY(unicode) == -1)
5162 return NULL;
5163
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005164 if (PyUnicode_UTF8(unicode))
5165 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5166 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005167
5168 kind = PyUnicode_KIND(unicode);
5169 data = PyUnicode_DATA(unicode);
5170 size = PyUnicode_GET_LENGTH(unicode);
5171
Benjamin Petersonead6b532011-12-20 17:23:42 -06005172 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005173 default:
5174 assert(0);
5175 case PyUnicode_1BYTE_KIND:
5176 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5177 assert(!PyUnicode_IS_ASCII(unicode));
5178 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5179 case PyUnicode_2BYTE_KIND:
5180 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5181 case PyUnicode_4BYTE_KIND:
5182 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184}
5185
Alexander Belopolsky40018472011-02-26 01:02:56 +00005186PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005187PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5188 Py_ssize_t size,
5189 const char *errors)
5190{
5191 PyObject *v, *unicode;
5192
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005193 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005194 if (unicode == NULL)
5195 return NULL;
5196 v = _PyUnicode_AsUTF8String(unicode, errors);
5197 Py_DECREF(unicode);
5198 return v;
5199}
5200
5201PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005202PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005204 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205}
5206
Walter Dörwald41980ca2007-08-16 21:55:45 +00005207/* --- UTF-32 Codec ------------------------------------------------------- */
5208
5209PyObject *
5210PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 Py_ssize_t size,
5212 const char *errors,
5213 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005214{
5215 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5216}
5217
5218PyObject *
5219PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 Py_ssize_t size,
5221 const char *errors,
5222 int *byteorder,
5223 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224{
5225 const char *starts = s;
5226 Py_ssize_t startinpos;
5227 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005228 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005229 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005230 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005231 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 PyObject *errorHandler = NULL;
5234 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005235
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236 q = (unsigned char *)s;
5237 e = q + size;
5238
5239 if (byteorder)
5240 bo = *byteorder;
5241
5242 /* Check for BOM marks (U+FEFF) in the input and adjust current
5243 byte order setting accordingly. In native mode, the leading BOM
5244 mark is skipped, in all other modes, it is copied to the output
5245 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005247 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 if (bom == 0x0000FEFF) {
5249 bo = -1;
5250 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005252 else if (bom == 0xFFFE0000) {
5253 bo = 1;
5254 q += 4;
5255 }
5256 if (byteorder)
5257 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005258 }
5259
Victor Stinnere64322e2012-10-30 23:12:47 +01005260 if (q == e) {
5261 if (consumed)
5262 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005263 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264 }
5265
Victor Stinnere64322e2012-10-30 23:12:47 +01005266#ifdef WORDS_BIGENDIAN
5267 le = bo < 0;
5268#else
5269 le = bo <= 0;
5270#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005271 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005272
Victor Stinner8f674cc2013-04-17 23:02:17 +02005273 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005274 writer.min_length = (e - q + 3) / 4;
5275 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005276 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005277
Victor Stinnere64322e2012-10-30 23:12:47 +01005278 while (1) {
5279 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005280 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005281
Victor Stinnere64322e2012-10-30 23:12:47 +01005282 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005283 enum PyUnicode_Kind kind = writer.kind;
5284 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005285 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005286 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005287 if (le) {
5288 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005289 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005290 if (ch > maxch)
5291 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005292 if (kind != PyUnicode_1BYTE_KIND &&
5293 Py_UNICODE_IS_SURROGATE(ch))
5294 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 q += 4;
5297 } while (q <= last);
5298 }
5299 else {
5300 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005301 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 if (ch > maxch)
5303 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 if (kind != PyUnicode_1BYTE_KIND &&
5305 Py_UNICODE_IS_SURROGATE(ch))
5306 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 q += 4;
5309 } while (q <= last);
5310 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005311 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005312 }
5313
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005314 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005315 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005316 startinpos = ((const char *)q) - starts;
5317 endinpos = startinpos + 4;
5318 }
5319 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005322 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 startinpos = ((const char *)q) - starts;
5325 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005327 else {
5328 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005329 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005330 goto onError;
5331 q += 4;
5332 continue;
5333 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005334 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 startinpos = ((const char *)q) - starts;
5336 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005338
5339 /* The remaining input chars are ignored if the callback
5340 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005341 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005345 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005347 }
5348
Walter Dörwald41980ca2007-08-16 21:55:45 +00005349 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005351
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352 Py_XDECREF(errorHandler);
5353 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005358 Py_XDECREF(errorHandler);
5359 Py_XDECREF(exc);
5360 return NULL;
5361}
5362
5363PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005364_PyUnicode_EncodeUTF32(PyObject *str,
5365 const char *errors,
5366 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005368 enum PyUnicode_Kind kind;
5369 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005370 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005371 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005372 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005373#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005374 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005376 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005378 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005379 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005380 PyObject *errorHandler = NULL;
5381 PyObject *exc = NULL;
5382 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005383
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005384 if (!PyUnicode_Check(str)) {
5385 PyErr_BadArgument();
5386 return NULL;
5387 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005388 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 return NULL;
5390 kind = PyUnicode_KIND(str);
5391 data = PyUnicode_DATA(str);
5392 len = PyUnicode_GET_LENGTH(str);
5393
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005394 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005395 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005396 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005397 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005398 if (v == NULL)
5399 return NULL;
5400
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005401 /* output buffer is 4-bytes aligned */
5402 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005403 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005404 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005406 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005408
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005410 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005411 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005412 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005413 else
5414 encoding = "utf-32";
5415
5416 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5418 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419 }
5420
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 pos = 0;
5422 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424
5425 if (kind == PyUnicode_2BYTE_KIND) {
5426 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5427 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005428 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 else {
5430 assert(kind == PyUnicode_4BYTE_KIND);
5431 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5432 &out, native_ordering);
5433 }
5434 if (pos == len)
5435 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005436
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005437 rep = unicode_encode_call_errorhandler(
5438 errors, &errorHandler,
5439 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005441 if (!rep)
5442 goto error;
5443
5444 if (PyBytes_Check(rep)) {
5445 repsize = PyBytes_GET_SIZE(rep);
5446 if (repsize & 3) {
5447 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 "surrogates not allowed");
5450 goto error;
5451 }
5452 moreunits = repsize / 4;
5453 }
5454 else {
5455 assert(PyUnicode_Check(rep));
5456 if (PyUnicode_READY(rep) < 0)
5457 goto error;
5458 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5459 if (!PyUnicode_IS_ASCII(rep)) {
5460 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 "surrogates not allowed");
5463 goto error;
5464 }
5465 }
5466
5467 /* four bytes are reserved for each surrogate */
5468 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005469 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005470 Py_ssize_t morebytes = 4 * (moreunits - 1);
5471 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5472 /* integer overflow */
5473 PyErr_NoMemory();
5474 goto error;
5475 }
5476 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5477 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005478 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 }
5480
5481 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005482 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005483 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005484 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005486 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5487 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005488 }
5489
5490 Py_CLEAR(rep);
5491 }
5492
5493 /* Cut back to size actually needed. This is necessary for, for example,
5494 encoding of a string containing isolated surrogates and the 'ignore'
5495 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005496 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 if (nsize != PyBytes_GET_SIZE(v))
5498 _PyBytes_Resize(&v, nsize);
5499 Py_XDECREF(errorHandler);
5500 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005502 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 error:
5504 Py_XDECREF(rep);
5505 Py_XDECREF(errorHandler);
5506 Py_XDECREF(exc);
5507 Py_XDECREF(v);
5508 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005509}
5510
Alexander Belopolsky40018472011-02-26 01:02:56 +00005511PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005512PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5513 Py_ssize_t size,
5514 const char *errors,
5515 int byteorder)
5516{
5517 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005518 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005519 if (tmp == NULL)
5520 return NULL;
5521 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5522 Py_DECREF(tmp);
5523 return result;
5524}
5525
5526PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005527PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005528{
Victor Stinnerb960b342011-11-20 19:12:52 +01005529 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005530}
5531
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532/* --- UTF-16 Codec ------------------------------------------------------- */
5533
Tim Peters772747b2001-08-09 22:21:55 +00005534PyObject *
5535PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 Py_ssize_t size,
5537 const char *errors,
5538 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539{
Walter Dörwald69652032004-09-07 20:24:22 +00005540 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5541}
5542
5543PyObject *
5544PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 Py_ssize_t size,
5546 const char *errors,
5547 int *byteorder,
5548 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005549{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005551 Py_ssize_t startinpos;
5552 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005553 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005554 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005555 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005556 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005557 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558 PyObject *errorHandler = NULL;
5559 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005560 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Tim Peters772747b2001-08-09 22:21:55 +00005562 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005563 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
5565 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005566 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005568 /* Check for BOM marks (U+FEFF) in the input and adjust current
5569 byte order setting accordingly. In native mode, the leading BOM
5570 mark is skipped, in all other modes, it is copied to the output
5571 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 if (bo == 0 && size >= 2) {
5573 const Py_UCS4 bom = (q[1] << 8) | q[0];
5574 if (bom == 0xFEFF) {
5575 q += 2;
5576 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005578 else if (bom == 0xFFFE) {
5579 q += 2;
5580 bo = 1;
5581 }
5582 if (byteorder)
5583 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Antoine Pitrou63065d72012-05-15 23:48:04 +02005586 if (q == e) {
5587 if (consumed)
5588 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005589 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005590 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005591
Christian Heimes743e0cd2012-10-17 23:52:17 +02005592#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005594 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005595#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005597 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005598#endif
Tim Peters772747b2001-08-09 22:21:55 +00005599
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600 /* Note: size will always be longer than the resulting Unicode
5601 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005602 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005603 writer.min_length = (e - q + 1) / 2;
5604 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005605 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005606
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 while (1) {
5608 Py_UCS4 ch = 0;
5609 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005610 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005612 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005613 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005614 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 native_ordering);
5616 else
5617 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005619 native_ordering);
5620 } else if (kind == PyUnicode_2BYTE_KIND) {
5621 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005622 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005623 native_ordering);
5624 } else {
5625 assert(kind == PyUnicode_4BYTE_KIND);
5626 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005627 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005629 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005630 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 switch (ch)
5633 {
5634 case 0:
5635 /* remaining byte at the end? (size should be even) */
5636 if (q == e || consumed)
5637 goto End;
5638 errmsg = "truncated data";
5639 startinpos = ((const char *)q) - starts;
5640 endinpos = ((const char *)e) - starts;
5641 break;
5642 /* The remaining input chars are ignored if the callback
5643 chooses to skip the input */
5644 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005645 q -= 2;
5646 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005647 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005648 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005649 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005650 endinpos = ((const char *)e) - starts;
5651 break;
5652 case 2:
5653 errmsg = "illegal encoding";
5654 startinpos = ((const char *)q) - 2 - starts;
5655 endinpos = startinpos + 2;
5656 break;
5657 case 3:
5658 errmsg = "illegal UTF-16 surrogate";
5659 startinpos = ((const char *)q) - 4 - starts;
5660 endinpos = startinpos + 2;
5661 break;
5662 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005663 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005664 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 continue;
5666 }
5667
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005668 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005669 errors,
5670 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005672 &starts,
5673 (const char **)&e,
5674 &startinpos,
5675 &endinpos,
5676 &exc,
5677 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 }
5681
Antoine Pitrou63065d72012-05-15 23:48:04 +02005682End:
Walter Dörwald69652032004-09-07 20:24:22 +00005683 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 Py_XDECREF(errorHandler);
5687 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 Py_XDECREF(errorHandler);
5693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 return NULL;
5695}
5696
Tim Peters772747b2001-08-09 22:21:55 +00005697PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005698_PyUnicode_EncodeUTF16(PyObject *str,
5699 const char *errors,
5700 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005702 enum PyUnicode_Kind kind;
5703 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005704 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005705 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005706 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005707 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005708#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005709 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005710#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005711 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005712#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 const char *encoding;
5714 Py_ssize_t nsize, pos;
5715 PyObject *errorHandler = NULL;
5716 PyObject *exc = NULL;
5717 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005718
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719 if (!PyUnicode_Check(str)) {
5720 PyErr_BadArgument();
5721 return NULL;
5722 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005723 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005724 return NULL;
5725 kind = PyUnicode_KIND(str);
5726 data = PyUnicode_DATA(str);
5727 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005728
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 if (kind == PyUnicode_4BYTE_KIND) {
5731 const Py_UCS4 *in = (const Py_UCS4 *)data;
5732 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005733 while (in < end) {
5734 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005736 }
5737 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005738 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005739 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005741 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005742 nsize = len + pairs + (byteorder == 0);
5743 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005744 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005748 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005749 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005750 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005752 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 }
5754 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005755 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 }
Tim Peters772747b2001-08-09 22:21:55 +00005757
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 if (kind == PyUnicode_1BYTE_KIND) {
5759 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5760 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005761 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005762
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005763 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005764 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
5766 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005768 }
5769 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005770 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005771 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005772
5773 pos = 0;
5774 while (pos < len) {
5775 Py_ssize_t repsize, moreunits;
5776
5777 if (kind == PyUnicode_2BYTE_KIND) {
5778 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5779 &out, native_ordering);
5780 }
5781 else {
5782 assert(kind == PyUnicode_4BYTE_KIND);
5783 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5784 &out, native_ordering);
5785 }
5786 if (pos == len)
5787 break;
5788
5789 rep = unicode_encode_call_errorhandler(
5790 errors, &errorHandler,
5791 encoding, "surrogates not allowed",
5792 str, &exc, pos, pos + 1, &pos);
5793 if (!rep)
5794 goto error;
5795
5796 if (PyBytes_Check(rep)) {
5797 repsize = PyBytes_GET_SIZE(rep);
5798 if (repsize & 1) {
5799 raise_encode_exception(&exc, encoding,
5800 str, pos - 1, pos,
5801 "surrogates not allowed");
5802 goto error;
5803 }
5804 moreunits = repsize / 2;
5805 }
5806 else {
5807 assert(PyUnicode_Check(rep));
5808 if (PyUnicode_READY(rep) < 0)
5809 goto error;
5810 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5811 if (!PyUnicode_IS_ASCII(rep)) {
5812 raise_encode_exception(&exc, encoding,
5813 str, pos - 1, pos,
5814 "surrogates not allowed");
5815 goto error;
5816 }
5817 }
5818
5819 /* two bytes are reserved for each surrogate */
5820 if (moreunits > 1) {
5821 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5822 Py_ssize_t morebytes = 2 * (moreunits - 1);
5823 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5824 /* integer overflow */
5825 PyErr_NoMemory();
5826 goto error;
5827 }
5828 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5829 goto error;
5830 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5831 }
5832
5833 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005834 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005835 out += moreunits;
5836 } else /* rep is unicode */ {
5837 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5838 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5839 &out, native_ordering);
5840 }
5841
5842 Py_CLEAR(rep);
5843 }
5844
5845 /* Cut back to size actually needed. This is necessary for, for example,
5846 encoding of a string containing isolated surrogates and the 'ignore' handler
5847 is used. */
5848 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5849 if (nsize != PyBytes_GET_SIZE(v))
5850 _PyBytes_Resize(&v, nsize);
5851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005853 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005854 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005855 error:
5856 Py_XDECREF(rep);
5857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
5859 Py_XDECREF(v);
5860 return NULL;
5861#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862}
5863
Alexander Belopolsky40018472011-02-26 01:02:56 +00005864PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5866 Py_ssize_t size,
5867 const char *errors,
5868 int byteorder)
5869{
5870 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005871 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005872 if (tmp == NULL)
5873 return NULL;
5874 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5875 Py_DECREF(tmp);
5876 return result;
5877}
5878
5879PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005880PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883}
5884
5885/* --- Unicode Escape Codec ----------------------------------------------- */
5886
Fredrik Lundh06d12682001-01-24 07:59:11 +00005887static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005888
Alexander Belopolsky40018472011-02-26 01:02:56 +00005889PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005890_PyUnicode_DecodeUnicodeEscape(const char *s,
5891 Py_ssize_t size,
5892 const char *errors,
5893 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 PyObject *errorHandler = NULL;
5899 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005900
Eric V. Smith42454af2016-10-31 09:22:08 -04005901 // so we can remember if we've seen an invalid escape char or not
5902 *first_invalid_escape = NULL;
5903
Victor Stinner62ec3312016-09-06 17:04:34 -07005904 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005905 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005906 }
5907 /* Escaped strings will always be longer than the resulting
5908 Unicode string, so we start with size here and then reduce the
5909 length after conversion to the true value.
5910 (but if the error callback returns a long replacement string
5911 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005912 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005913 writer.min_length = size;
5914 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5915 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005916 }
5917
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 end = s + size;
5919 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005920 unsigned char c = (unsigned char) *s++;
5921 Py_UCS4 ch;
5922 int count;
5923 Py_ssize_t startinpos;
5924 Py_ssize_t endinpos;
5925 const char *message;
5926
5927#define WRITE_ASCII_CHAR(ch) \
5928 do { \
5929 assert(ch <= 127); \
5930 assert(writer.pos < writer.size); \
5931 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5932 } while(0)
5933
5934#define WRITE_CHAR(ch) \
5935 do { \
5936 if (ch <= writer.maxchar) { \
5937 assert(writer.pos < writer.size); \
5938 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5939 } \
5940 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5941 goto onError; \
5942 } \
5943 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944
5945 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005946 if (c != '\\') {
5947 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 continue;
5949 }
5950
Victor Stinner62ec3312016-09-06 17:04:34 -07005951 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005953 if (s >= end) {
5954 message = "\\ at end of string";
5955 goto error;
5956 }
5957 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005958
Victor Stinner62ec3312016-09-06 17:04:34 -07005959 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005960 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005963 case '\n': continue;
5964 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5965 case '\'': WRITE_ASCII_CHAR('\''); continue;
5966 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5967 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5970 case 't': WRITE_ASCII_CHAR('\t'); continue;
5971 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5972 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005973 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005974 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005975 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005976 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 case '0': case '1': case '2': case '3':
5980 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005982 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005983 ch = (ch<<3) + *s++ - '0';
5984 if (s < end && '0' <= *s && *s <= '7') {
5985 ch = (ch<<3) + *s++ - '0';
5986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005988 WRITE_CHAR(ch);
5989 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 /* hex escapes */
5992 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005995 message = "truncated \\xXX escape";
5996 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006000 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006001 message = "truncated \\uXXXX escape";
6002 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006005 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 message = "truncated \\UXXXXXXXX escape";
6008 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006009 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006010 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 ch <<= 4;
6012 if (c >= '0' && c <= '9') {
6013 ch += c - '0';
6014 }
6015 else if (c >= 'a' && c <= 'f') {
6016 ch += c - ('a' - 10);
6017 }
6018 else if (c >= 'A' && c <= 'F') {
6019 ch += c - ('A' - 10);
6020 }
6021 else {
6022 break;
6023 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006024 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006025 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006026 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006027 }
6028
6029 /* when we get here, ch is a 32-bit unicode character */
6030 if (ch > MAX_UNICODE) {
6031 message = "illegal Unicode character";
6032 goto error;
6033 }
6034
6035 WRITE_CHAR(ch);
6036 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006037
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006039 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006040 if (ucnhash_CAPI == NULL) {
6041 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006042 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6043 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006044 if (ucnhash_CAPI == NULL) {
6045 PyErr_SetString(
6046 PyExc_UnicodeError,
6047 "\\N escapes not supported (can't load unicodedata module)"
6048 );
6049 goto onError;
6050 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006051 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006052
6053 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006055 const char *start = ++s;
6056 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006058 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006060 namelen = s - start;
6061 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006064 ch = 0xffffffff; /* in case 'getcode' messes up */
6065 if (namelen <= INT_MAX &&
6066 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6067 &ch, 0)) {
6068 assert(ch <= MAX_UNICODE);
6069 WRITE_CHAR(ch);
6070 continue;
6071 }
6072 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 }
6074 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006075 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076
6077 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006078 if (*first_invalid_escape == NULL) {
6079 *first_invalid_escape = s-1; /* Back up one char, since we've
6080 already incremented s. */
6081 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 WRITE_ASCII_CHAR('\\');
6083 WRITE_CHAR(c);
6084 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006086
6087 error:
6088 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006089 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006090 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006091 errors, &errorHandler,
6092 "unicodeescape", message,
6093 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006094 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006095 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006096 }
6097 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6098 goto onError;
6099 }
6100
6101#undef WRITE_ASCII_CHAR
6102#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006104
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006105 Py_XDECREF(errorHandler);
6106 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006107 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006108
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006110 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 Py_XDECREF(errorHandler);
6112 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 return NULL;
6114}
6115
Eric V. Smith42454af2016-10-31 09:22:08 -04006116PyObject *
6117PyUnicode_DecodeUnicodeEscape(const char *s,
6118 Py_ssize_t size,
6119 const char *errors)
6120{
6121 const char *first_invalid_escape;
6122 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6123 &first_invalid_escape);
6124 if (result == NULL)
6125 return NULL;
6126 if (first_invalid_escape != NULL) {
6127 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6128 "invalid escape sequence '\\%c'",
6129 *first_invalid_escape) < 0) {
6130 Py_DECREF(result);
6131 return NULL;
6132 }
6133 }
6134 return result;
6135}
6136
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006137/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Alexander Belopolsky40018472011-02-26 01:02:56 +00006139PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006143 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006145 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Ezio Melottie7f90372012-10-05 03:33:31 +03006149 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006150 escape.
6151
Ezio Melottie7f90372012-10-05 03:33:31 +03006152 For UCS1 strings it's '\xxx', 4 bytes per source character.
6153 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6154 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006155 */
6156
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157 if (!PyUnicode_Check(unicode)) {
6158 PyErr_BadArgument();
6159 return NULL;
6160 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 }
Victor Stinner358af132015-10-12 22:36:57 +02006164
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006166 if (len == 0) {
6167 return PyBytes_FromStringAndSize(NULL, 0);
6168 }
6169
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6173 bytes, and 1 byte characters 4. */
6174 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006175 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 return PyErr_NoMemory();
6177 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006178 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 if (repr == NULL) {
6180 return NULL;
6181 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006185 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006186
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 /* U+0000-U+00ff range */
6188 if (ch < 0x100) {
6189 if (ch >= ' ' && ch < 127) {
6190 if (ch != '\\') {
6191 /* Copy printable US ASCII as-is */
6192 *p++ = (char) ch;
6193 }
6194 /* Escape backslashes */
6195 else {
6196 *p++ = '\\';
6197 *p++ = '\\';
6198 }
6199 }
Victor Stinner358af132015-10-12 22:36:57 +02006200
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 /* Map special whitespace to '\t', \n', '\r' */
6202 else if (ch == '\t') {
6203 *p++ = '\\';
6204 *p++ = 't';
6205 }
6206 else if (ch == '\n') {
6207 *p++ = '\\';
6208 *p++ = 'n';
6209 }
6210 else if (ch == '\r') {
6211 *p++ = '\\';
6212 *p++ = 'r';
6213 }
6214
6215 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6216 else {
6217 *p++ = '\\';
6218 *p++ = 'x';
6219 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6220 *p++ = Py_hexdigits[ch & 0x000F];
6221 }
Tim Petersced69f82003-09-16 20:30:58 +00006222 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006223 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 *p++ = '\\';
6226 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006227 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6228 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6233 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006234
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 /* Make sure that the first two digits are zero */
6236 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006237 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006238 *p++ = 'U';
6239 *p++ = '0';
6240 *p++ = '0';
6241 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6242 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6243 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6244 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6245 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6246 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 assert(p - PyBytes_AS_STRING(repr) > 0);
6251 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6252 return NULL;
6253 }
6254 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255}
6256
Alexander Belopolsky40018472011-02-26 01:02:56 +00006257PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006258PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6259 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006261 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006262 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 }
6266
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006267 result = PyUnicode_AsUnicodeEscapeString(tmp);
6268 Py_DECREF(tmp);
6269 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270}
6271
6272/* --- Raw Unicode Escape Codec ------------------------------------------- */
6273
Alexander Belopolsky40018472011-02-26 01:02:56 +00006274PyObject *
6275PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006276 Py_ssize_t size,
6277 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006280 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006282 PyObject *errorHandler = NULL;
6283 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006284
Victor Stinner62ec3312016-09-06 17:04:34 -07006285 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006286 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006287 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 /* Escaped strings will always be longer than the resulting
6290 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 length after conversion to the true value. (But decoding error
6292 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006293 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 writer.min_length = size;
6295 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6296 goto onError;
6297 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006298
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 end = s + size;
6300 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 unsigned char c = (unsigned char) *s++;
6302 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006303 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 Py_ssize_t startinpos;
6305 Py_ssize_t endinpos;
6306 const char *message;
6307
6308#define WRITE_CHAR(ch) \
6309 do { \
6310 if (ch <= writer.maxchar) { \
6311 assert(writer.pos < writer.size); \
6312 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6313 } \
6314 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6315 goto onError; \
6316 } \
6317 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 if (c != '\\' || s >= end) {
6321 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006323 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006324
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 c = (unsigned char) *s++;
6326 if (c == 'u') {
6327 count = 4;
6328 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 else if (c == 'U') {
6331 count = 8;
6332 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006333 }
6334 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 assert(writer.pos < writer.size);
6336 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6337 WRITE_CHAR(c);
6338 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006339 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 startinpos = s - starts - 2;
6341
6342 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6343 for (ch = 0; count && s < end; ++s, --count) {
6344 c = (unsigned char)*s;
6345 ch <<= 4;
6346 if (c >= '0' && c <= '9') {
6347 ch += c - '0';
6348 }
6349 else if (c >= 'a' && c <= 'f') {
6350 ch += c - ('a' - 10);
6351 }
6352 else if (c >= 'A' && c <= 'F') {
6353 ch += c - ('A' - 10);
6354 }
6355 else {
6356 break;
6357 }
6358 }
6359 if (!count) {
6360 if (ch <= MAX_UNICODE) {
6361 WRITE_CHAR(ch);
6362 continue;
6363 }
6364 message = "\\Uxxxxxxxx out of range";
6365 }
6366
6367 endinpos = s-starts;
6368 writer.min_length = end - s + writer.pos;
6369 if (unicode_decode_call_errorhandler_writer(
6370 errors, &errorHandler,
6371 "rawunicodeescape", message,
6372 &starts, &end, &startinpos, &endinpos, &exc, &s,
6373 &writer)) {
6374 goto onError;
6375 }
6376 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6377 goto onError;
6378 }
6379
6380#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382 Py_XDECREF(errorHandler);
6383 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006384 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006385
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006387 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388 Py_XDECREF(errorHandler);
6389 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006391
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392}
6393
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006394
Alexander Belopolsky40018472011-02-26 01:02:56 +00006395PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006396PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Victor Stinner62ec3312016-09-06 17:04:34 -07006398 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401 int kind;
6402 void *data;
6403 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006405 if (!PyUnicode_Check(unicode)) {
6406 PyErr_BadArgument();
6407 return NULL;
6408 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412 kind = PyUnicode_KIND(unicode);
6413 data = PyUnicode_DATA(unicode);
6414 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 if (kind == PyUnicode_1BYTE_KIND) {
6416 return PyBytes_FromStringAndSize(data, len);
6417 }
Victor Stinner0e368262011-11-10 20:12:49 +01006418
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6420 bytes, and 1 byte characters 4. */
6421 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006422
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (len > PY_SSIZE_T_MAX / expandsize) {
6424 return PyErr_NoMemory();
6425 }
6426 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6427 if (repr == NULL) {
6428 return NULL;
6429 }
6430 if (len == 0) {
6431 return repr;
6432 }
6433
6434 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006435 for (pos = 0; pos < len; pos++) {
6436 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006437
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6439 if (ch < 0x100) {
6440 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006441 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006442 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6443 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 *p++ = '\\';
6445 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006446 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6447 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6448 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6449 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6452 else {
6453 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6454 *p++ = '\\';
6455 *p++ = 'U';
6456 *p++ = '0';
6457 *p++ = '0';
6458 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6459 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6460 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463 *p++ = Py_hexdigits[ch & 15];
6464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006466
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 assert(p > PyBytes_AS_STRING(repr));
6468 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6469 return NULL;
6470 }
6471 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472}
6473
Alexander Belopolsky40018472011-02-26 01:02:56 +00006474PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006475PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006478 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006479 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006480 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006481 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006482 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6483 Py_DECREF(tmp);
6484 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485}
6486
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006487/* --- Unicode Internal Codec ------------------------------------------- */
6488
Alexander Belopolsky40018472011-02-26 01:02:56 +00006489PyObject *
6490_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006491 Py_ssize_t size,
6492 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006493{
6494 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 Py_ssize_t startinpos;
6496 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006497 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006498 const char *end;
6499 const char *reason;
6500 PyObject *errorHandler = NULL;
6501 PyObject *exc = NULL;
6502
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006503 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006504 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006505 1))
6506 return NULL;
6507
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006508 if (size == 0)
6509 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006510
Victor Stinner8f674cc2013-04-17 23:02:17 +02006511 _PyUnicodeWriter_Init(&writer);
6512 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6513 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006515 }
6516 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517
Victor Stinner8f674cc2013-04-17 23:02:17 +02006518 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006519 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006520 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006521 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006522 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006523 endinpos = end-starts;
6524 reason = "truncated input";
6525 goto error;
6526 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006527 /* We copy the raw representation one byte at a time because the
6528 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006529 ((char *) &uch)[0] = s[0];
6530 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006531#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006532 ((char *) &uch)[2] = s[2];
6533 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006534#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006535 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006536#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 /* We have to sanity check the raw data, otherwise doom looms for
6538 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006539 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006540 endinpos = s - starts + Py_UNICODE_SIZE;
6541 reason = "illegal code point (> 0x10FFFF)";
6542 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006543 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006544#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006545 s += Py_UNICODE_SIZE;
6546#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006547 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006548 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006549 Py_UNICODE uch2;
6550 ((char *) &uch2)[0] = s[0];
6551 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006552 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006553 {
Victor Stinner551ac952011-11-29 22:58:13 +01006554 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006555 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006556 }
6557 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006558#endif
6559
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006560 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006561 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006562 continue;
6563
6564 error:
6565 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006566 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006567 errors, &errorHandler,
6568 "unicode_internal", reason,
6569 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006570 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006571 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006572 }
6573
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006574 Py_XDECREF(errorHandler);
6575 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006576 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006577
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006579 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006580 Py_XDECREF(errorHandler);
6581 Py_XDECREF(exc);
6582 return NULL;
6583}
6584
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585/* --- Latin-1 Codec ------------------------------------------------------ */
6586
Alexander Belopolsky40018472011-02-26 01:02:56 +00006587PyObject *
6588PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006589 Py_ssize_t size,
6590 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006593 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594}
6595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597static void
6598make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006599 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006600 PyObject *unicode,
6601 Py_ssize_t startpos, Py_ssize_t endpos,
6602 const char *reason)
6603{
6604 if (*exceptionObject == NULL) {
6605 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006607 encoding, unicode, startpos, endpos, reason);
6608 }
6609 else {
6610 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6611 goto onError;
6612 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6613 goto onError;
6614 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6615 goto onError;
6616 return;
6617 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006618 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006619 }
6620}
6621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623static void
6624raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006625 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006626 PyObject *unicode,
6627 Py_ssize_t startpos, Py_ssize_t endpos,
6628 const char *reason)
6629{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006630 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006631 encoding, unicode, startpos, endpos, reason);
6632 if (*exceptionObject != NULL)
6633 PyCodec_StrictErrors(*exceptionObject);
6634}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635
6636/* error handling callback helper:
6637 build arguments, call the callback and check the arguments,
6638 put the result into newpos and return the replacement string, which
6639 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640static PyObject *
6641unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006642 PyObject **errorHandler,
6643 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006645 Py_ssize_t startpos, Py_ssize_t endpos,
6646 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006648 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 PyObject *restuple;
6651 PyObject *resunicode;
6652
6653 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 }
6658
Benjamin Petersonbac79492012-01-14 13:34:47 -05006659 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006660 return NULL;
6661 len = PyUnicode_GET_LENGTH(unicode);
6662
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006663 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006668 restuple = PyObject_CallFunctionObjArgs(
6669 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006673 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 Py_DECREF(restuple);
6675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006677 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 &resunicode, newpos)) {
6679 Py_DECREF(restuple);
6680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006682 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6683 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6684 Py_DECREF(restuple);
6685 return NULL;
6686 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006688 *newpos = len + *newpos;
6689 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006690 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 Py_DECREF(restuple);
6692 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006693 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 Py_INCREF(resunicode);
6695 Py_DECREF(restuple);
6696 return resunicode;
6697}
6698
Alexander Belopolsky40018472011-02-26 01:02:56 +00006699static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006700unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006701 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006702 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006704 /* input state */
6705 Py_ssize_t pos=0, size;
6706 int kind;
6707 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 /* pointer into the output */
6709 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006710 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6711 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006712 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006714 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006715 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006716 /* output object */
6717 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718
Benjamin Petersonbac79492012-01-14 13:34:47 -05006719 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 return NULL;
6721 size = PyUnicode_GET_LENGTH(unicode);
6722 kind = PyUnicode_KIND(unicode);
6723 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 /* allocate enough for a simple encoding without
6725 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006726 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006727 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006728
6729 _PyBytesWriter_Init(&writer);
6730 str = _PyBytesWriter_Alloc(&writer, size);
6731 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006732 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006735 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006738 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006740 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006741 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006744 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006746 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006747 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006749
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006750 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006752
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006753 /* Only overallocate the buffer if it's not the last write */
6754 writer.overallocate = (collend < size);
6755
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006757 if (error_handler == _Py_ERROR_UNKNOWN)
6758 error_handler = get_error_handler(errors);
6759
6760 switch (error_handler) {
6761 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006762 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006764
6765 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006766 memset(str, '?', collend - collstart);
6767 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006768 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006769 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006770 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 break;
Victor Stinner50149202015-09-22 00:26:54 +02006772
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006773 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006774 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006775 writer.min_size -= (collend - collstart);
6776 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006777 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006778 if (str == NULL)
6779 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006780 pos = collend;
6781 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006782
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006783 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006784 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006785 writer.min_size -= (collend - collstart);
6786 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006787 unicode, collstart, collend);
6788 if (str == NULL)
6789 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006790 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 break;
Victor Stinner50149202015-09-22 00:26:54 +02006792
Victor Stinnerc3713e92015-09-29 12:32:13 +02006793 case _Py_ERROR_SURROGATEESCAPE:
6794 for (i = collstart; i < collend; ++i) {
6795 ch = PyUnicode_READ(kind, data, i);
6796 if (ch < 0xdc80 || 0xdcff < ch) {
6797 /* Not a UTF-8b surrogate */
6798 break;
6799 }
6800 *str++ = (char)(ch - 0xdc00);
6801 ++pos;
6802 }
6803 if (i >= collend)
6804 break;
6805 collstart = pos;
6806 assert(collstart != collend);
6807 /* fallback to general error handling */
6808
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006810 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6811 encoding, reason, unicode, &exc,
6812 collstart, collend, &newpos);
6813 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006815
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006816 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006817 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006818
Victor Stinner6bd525b2015-10-09 13:10:05 +02006819 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006820 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006821 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006822 PyBytes_AS_STRING(rep),
6823 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006824 if (str == NULL)
6825 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006826 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006827 else {
6828 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006829
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006832
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006833 if (limit == 256 ?
6834 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6835 !PyUnicode_IS_ASCII(rep))
6836 {
6837 /* Not all characters are smaller than limit */
6838 raise_encode_exception(&exc, encoding, unicode,
6839 collstart, collend, reason);
6840 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006842 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6843 str = _PyBytesWriter_WriteBytes(&writer, str,
6844 PyUnicode_DATA(rep),
6845 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006847 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006849 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006850
6851 /* If overallocation was disabled, ensure that it was the last
6852 write. Otherwise, we missed an optimization */
6853 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006854 }
6855 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006856
Victor Stinner50149202015-09-22 00:26:54 +02006857 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006859 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006860
6861 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006862 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006863 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006864 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006865 Py_XDECREF(exc);
6866 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006867}
6868
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006869/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006870PyObject *
6871PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006872 Py_ssize_t size,
6873 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006875 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006876 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006877 if (unicode == NULL)
6878 return NULL;
6879 result = unicode_encode_ucs1(unicode, errors, 256);
6880 Py_DECREF(unicode);
6881 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882}
6883
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006885_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
6887 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 PyErr_BadArgument();
6889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006891 if (PyUnicode_READY(unicode) == -1)
6892 return NULL;
6893 /* Fast path: if it is a one-byte string, construct
6894 bytes object directly. */
6895 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6896 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6897 PyUnicode_GET_LENGTH(unicode));
6898 /* Non-Latin-1 characters present. Defer to above function to
6899 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006900 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006901}
6902
6903PyObject*
6904PyUnicode_AsLatin1String(PyObject *unicode)
6905{
6906 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907}
6908
6909/* --- 7-bit ASCII Codec -------------------------------------------------- */
6910
Alexander Belopolsky40018472011-02-26 01:02:56 +00006911PyObject *
6912PyUnicode_DecodeASCII(const char *s,
6913 Py_ssize_t size,
6914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006917 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006918 int kind;
6919 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006920 Py_ssize_t startinpos;
6921 Py_ssize_t endinpos;
6922 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006924 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006925 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006926 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006927
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006929 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006930
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006932 if (size == 1 && (unsigned char)s[0] < 128)
6933 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006934
Victor Stinner8f674cc2013-04-17 23:02:17 +02006935 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006936 writer.min_length = size;
6937 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006938 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006941 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006942 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006943 writer.pos = outpos;
6944 if (writer.pos == size)
6945 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006946
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006947 s += writer.pos;
6948 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006949 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006950 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006952 PyUnicode_WRITE(kind, data, writer.pos, c);
6953 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006955 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006956 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006957
6958 /* byte outsize range 0x00..0x7f: call the error handler */
6959
6960 if (error_handler == _Py_ERROR_UNKNOWN)
6961 error_handler = get_error_handler(errors);
6962
6963 switch (error_handler)
6964 {
6965 case _Py_ERROR_REPLACE:
6966 case _Py_ERROR_SURROGATEESCAPE:
6967 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006968 but we may switch to UCS2 at the first write */
6969 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6970 goto onError;
6971 kind = writer.kind;
6972 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006973
6974 if (error_handler == _Py_ERROR_REPLACE)
6975 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6976 else
6977 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6978 writer.pos++;
6979 ++s;
6980 break;
6981
6982 case _Py_ERROR_IGNORE:
6983 ++s;
6984 break;
6985
6986 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 startinpos = s-starts;
6988 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006989 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006990 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 "ascii", "ordinal not in range(128)",
6992 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006993 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006995 kind = writer.kind;
6996 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006999 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007000 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007001 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007002
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007005 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 return NULL;
7008}
7009
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007010/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007011PyObject *
7012PyUnicode_EncodeASCII(const Py_UNICODE *p,
7013 Py_ssize_t size,
7014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007016 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007017 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007018 if (unicode == NULL)
7019 return NULL;
7020 result = unicode_encode_ucs1(unicode, errors, 128);
7021 Py_DECREF(unicode);
7022 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023}
7024
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007026_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
7028 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 PyErr_BadArgument();
7030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007032 if (PyUnicode_READY(unicode) == -1)
7033 return NULL;
7034 /* Fast path: if it is an ASCII-only string, construct bytes object
7035 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007036 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007037 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7038 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007039 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040}
7041
7042PyObject *
7043PyUnicode_AsASCIIString(PyObject *unicode)
7044{
7045 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046}
7047
Steve Dowercc16be82016-09-08 10:35:16 -07007048#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007049
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007050/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007051
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007052#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053#define NEED_RETRY
7054#endif
7055
Victor Stinner3a50e702011-10-18 21:21:00 +02007056#ifndef WC_ERR_INVALID_CHARS
7057# define WC_ERR_INVALID_CHARS 0x0080
7058#endif
7059
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007060static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007061code_page_name(UINT code_page, PyObject **obj)
7062{
7063 *obj = NULL;
7064 if (code_page == CP_ACP)
7065 return "mbcs";
7066 if (code_page == CP_UTF7)
7067 return "CP_UTF7";
7068 if (code_page == CP_UTF8)
7069 return "CP_UTF8";
7070
7071 *obj = PyBytes_FromFormat("cp%u", code_page);
7072 if (*obj == NULL)
7073 return NULL;
7074 return PyBytes_AS_STRING(*obj);
7075}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076
Victor Stinner3a50e702011-10-18 21:21:00 +02007077static DWORD
7078decode_code_page_flags(UINT code_page)
7079{
7080 if (code_page == CP_UTF7) {
7081 /* The CP_UTF7 decoder only supports flags=0 */
7082 return 0;
7083 }
7084 else
7085 return MB_ERR_INVALID_CHARS;
7086}
7087
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 * Decode a byte string from a Windows code page into unicode object in strict
7090 * mode.
7091 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007092 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7093 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007095static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007096decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007097 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 const char *in,
7099 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100{
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007102 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104
7105 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 assert(insize > 0);
7107 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7108 if (outsize <= 0)
7109 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110
7111 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007113 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007114 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 if (*v == NULL)
7116 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118 }
7119 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007122 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125 }
7126
7127 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7129 if (outsize <= 0)
7130 goto error;
7131 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007132
Victor Stinner3a50e702011-10-18 21:21:00 +02007133error:
7134 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7135 return -2;
7136 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007137 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138}
7139
Victor Stinner3a50e702011-10-18 21:21:00 +02007140/*
7141 * Decode a byte string from a code page into unicode object with an error
7142 * handler.
7143 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007144 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 * UnicodeDecodeError exception and returns -1 on error.
7146 */
7147static int
7148decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007149 PyObject **v,
7150 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007151 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007152{
7153 const char *startin = in;
7154 const char *endin = in + size;
7155 const DWORD flags = decode_code_page_flags(code_page);
7156 /* Ideally, we should get reason from FormatMessage. This is the Windows
7157 2000 English version of the message. */
7158 const char *reason = "No mapping for the Unicode character exists "
7159 "in the target code page.";
7160 /* each step cannot decode more than 1 character, but a character can be
7161 represented as a surrogate pair */
7162 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007163 int insize;
7164 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 PyObject *errorHandler = NULL;
7166 PyObject *exc = NULL;
7167 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007168 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 DWORD err;
7170 int ret = -1;
7171
7172 assert(size > 0);
7173
7174 encoding = code_page_name(code_page, &encoding_obj);
7175 if (encoding == NULL)
7176 return -1;
7177
Victor Stinner7d00cc12014-03-17 23:08:06 +01007178 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7180 UnicodeDecodeError. */
7181 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7182 if (exc != NULL) {
7183 PyCodec_StrictErrors(exc);
7184 Py_CLEAR(exc);
7185 }
7186 goto error;
7187 }
7188
7189 if (*v == NULL) {
7190 /* Create unicode object */
7191 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7192 PyErr_NoMemory();
7193 goto error;
7194 }
Victor Stinnerab595942011-12-17 04:59:06 +01007195 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007196 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 if (*v == NULL)
7198 goto error;
7199 startout = PyUnicode_AS_UNICODE(*v);
7200 }
7201 else {
7202 /* Extend unicode object */
7203 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7204 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7205 PyErr_NoMemory();
7206 goto error;
7207 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007208 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 goto error;
7210 startout = PyUnicode_AS_UNICODE(*v) + n;
7211 }
7212
7213 /* Decode the byte string character per character */
7214 out = startout;
7215 while (in < endin)
7216 {
7217 /* Decode a character */
7218 insize = 1;
7219 do
7220 {
7221 outsize = MultiByteToWideChar(code_page, flags,
7222 in, insize,
7223 buffer, Py_ARRAY_LENGTH(buffer));
7224 if (outsize > 0)
7225 break;
7226 err = GetLastError();
7227 if (err != ERROR_NO_UNICODE_TRANSLATION
7228 && err != ERROR_INSUFFICIENT_BUFFER)
7229 {
7230 PyErr_SetFromWindowsErr(0);
7231 goto error;
7232 }
7233 insize++;
7234 }
7235 /* 4=maximum length of a UTF-8 sequence */
7236 while (insize <= 4 && (in + insize) <= endin);
7237
7238 if (outsize <= 0) {
7239 Py_ssize_t startinpos, endinpos, outpos;
7240
Victor Stinner7d00cc12014-03-17 23:08:06 +01007241 /* last character in partial decode? */
7242 if (in + insize >= endin && !final)
7243 break;
7244
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 startinpos = in - startin;
7246 endinpos = startinpos + 1;
7247 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007248 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 errors, &errorHandler,
7250 encoding, reason,
7251 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007252 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 {
7254 goto error;
7255 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007256 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 }
7258 else {
7259 in += insize;
7260 memcpy(out, buffer, outsize * sizeof(wchar_t));
7261 out += outsize;
7262 }
7263 }
7264
7265 /* write a NUL character at the end */
7266 *out = 0;
7267
7268 /* Extend unicode object */
7269 outsize = out - startout;
7270 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007271 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007273 /* (in - startin) <= size and size is an int */
7274 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007275
7276error:
7277 Py_XDECREF(encoding_obj);
7278 Py_XDECREF(errorHandler);
7279 Py_XDECREF(exc);
7280 return ret;
7281}
7282
Victor Stinner3a50e702011-10-18 21:21:00 +02007283static PyObject *
7284decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007285 const char *s, Py_ssize_t size,
7286 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287{
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 PyObject *v = NULL;
7289 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 if (code_page < 0) {
7292 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7293 return NULL;
7294 }
7295
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007296 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007298
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 do
7300 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007302 if (size > INT_MAX) {
7303 chunk_size = INT_MAX;
7304 final = 0;
7305 done = 0;
7306 }
7307 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007309 {
7310 chunk_size = (int)size;
7311 final = (consumed == NULL);
7312 done = 1;
7313 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314
Victor Stinner76a31a62011-11-04 00:05:13 +01007315 if (chunk_size == 0 && done) {
7316 if (v != NULL)
7317 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007318 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 converted = decode_code_page_strict(code_page, &v,
7322 s, chunk_size);
7323 if (converted == -2)
7324 converted = decode_code_page_errors(code_page, &v,
7325 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007326 errors, final);
7327 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007328
7329 if (converted < 0) {
7330 Py_XDECREF(v);
7331 return NULL;
7332 }
7333
7334 if (consumed)
7335 *consumed += converted;
7336
7337 s += converted;
7338 size -= converted;
7339 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007340
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007341 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342}
7343
Alexander Belopolsky40018472011-02-26 01:02:56 +00007344PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007345PyUnicode_DecodeCodePageStateful(int code_page,
7346 const char *s,
7347 Py_ssize_t size,
7348 const char *errors,
7349 Py_ssize_t *consumed)
7350{
7351 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7352}
7353
7354PyObject *
7355PyUnicode_DecodeMBCSStateful(const char *s,
7356 Py_ssize_t size,
7357 const char *errors,
7358 Py_ssize_t *consumed)
7359{
7360 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7361}
7362
7363PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007364PyUnicode_DecodeMBCS(const char *s,
7365 Py_ssize_t size,
7366 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007367{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7369}
7370
Victor Stinner3a50e702011-10-18 21:21:00 +02007371static DWORD
7372encode_code_page_flags(UINT code_page, const char *errors)
7373{
7374 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007375 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 }
7377 else if (code_page == CP_UTF7) {
7378 /* CP_UTF7 only supports flags=0 */
7379 return 0;
7380 }
7381 else {
7382 if (errors != NULL && strcmp(errors, "replace") == 0)
7383 return 0;
7384 else
7385 return WC_NO_BEST_FIT_CHARS;
7386 }
7387}
7388
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007389/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 * Encode a Unicode string to a Windows code page into a byte string in strict
7391 * mode.
7392 *
7393 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007394 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007396static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007397encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007398 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007400{
Victor Stinner554f3f02010-06-16 23:33:54 +00007401 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 BOOL *pusedDefaultChar = &usedDefaultChar;
7403 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007404 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007405 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 const DWORD flags = encode_code_page_flags(code_page, NULL);
7407 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007408 /* Create a substring so that we can get the UTF-16 representation
7409 of just the slice under consideration. */
7410 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411
Martin v. Löwis3d325192011-11-04 18:23:06 +01007412 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007413
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007415 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007417 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007418
Victor Stinner2fc507f2011-11-04 20:06:39 +01007419 substring = PyUnicode_Substring(unicode, offset, offset+len);
7420 if (substring == NULL)
7421 return -1;
7422 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7423 if (p == NULL) {
7424 Py_DECREF(substring);
7425 return -1;
7426 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007427 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007428
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007429 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007431 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 NULL, 0,
7433 NULL, pusedDefaultChar);
7434 if (outsize <= 0)
7435 goto error;
7436 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 if (pusedDefaultChar && *pusedDefaultChar) {
7438 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007440 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007441
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007444 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007445 if (*outbytes == NULL) {
7446 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007448 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007450 }
7451 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 const Py_ssize_t n = PyBytes_Size(*outbytes);
7454 if (outsize > PY_SSIZE_T_MAX - n) {
7455 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7460 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464 }
7465
7466 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007468 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 out, outsize,
7470 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 if (outsize <= 0)
7473 goto error;
7474 if (pusedDefaultChar && *pusedDefaultChar)
7475 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007476 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7481 return -2;
7482 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007483 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007484}
7485
Victor Stinner3a50e702011-10-18 21:21:00 +02007486/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007487 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 * error handler.
7489 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007490 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 * -1 on other error.
7492 */
7493static int
7494encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007495 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007497{
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007499 Py_ssize_t pos = unicode_offset;
7500 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 /* Ideally, we should get reason from FormatMessage. This is the Windows
7502 2000 English version of the message. */
7503 const char *reason = "invalid character";
7504 /* 4=maximum length of a UTF-8 sequence */
7505 char buffer[4];
7506 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7507 Py_ssize_t outsize;
7508 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 PyObject *errorHandler = NULL;
7510 PyObject *exc = NULL;
7511 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007512 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 PyObject *rep;
7515 int ret = -1;
7516
7517 assert(insize > 0);
7518
7519 encoding = code_page_name(code_page, &encoding_obj);
7520 if (encoding == NULL)
7521 return -1;
7522
7523 if (errors == NULL || strcmp(errors, "strict") == 0) {
7524 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7525 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007526 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 if (exc != NULL) {
7528 PyCodec_StrictErrors(exc);
7529 Py_DECREF(exc);
7530 }
7531 Py_XDECREF(encoding_obj);
7532 return -1;
7533 }
7534
7535 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7536 pusedDefaultChar = &usedDefaultChar;
7537 else
7538 pusedDefaultChar = NULL;
7539
7540 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7541 PyErr_NoMemory();
7542 goto error;
7543 }
7544 outsize = insize * Py_ARRAY_LENGTH(buffer);
7545
7546 if (*outbytes == NULL) {
7547 /* Create string object */
7548 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7549 if (*outbytes == NULL)
7550 goto error;
7551 out = PyBytes_AS_STRING(*outbytes);
7552 }
7553 else {
7554 /* Extend string object */
7555 Py_ssize_t n = PyBytes_Size(*outbytes);
7556 if (n > PY_SSIZE_T_MAX - outsize) {
7557 PyErr_NoMemory();
7558 goto error;
7559 }
7560 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7561 goto error;
7562 out = PyBytes_AS_STRING(*outbytes) + n;
7563 }
7564
7565 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007566 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007568 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7569 wchar_t chars[2];
7570 int charsize;
7571 if (ch < 0x10000) {
7572 chars[0] = (wchar_t)ch;
7573 charsize = 1;
7574 }
7575 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007576 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7577 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007578 charsize = 2;
7579 }
7580
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007582 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 buffer, Py_ARRAY_LENGTH(buffer),
7584 NULL, pusedDefaultChar);
7585 if (outsize > 0) {
7586 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7587 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007588 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 memcpy(out, buffer, outsize);
7590 out += outsize;
7591 continue;
7592 }
7593 }
7594 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7595 PyErr_SetFromWindowsErr(0);
7596 goto error;
7597 }
7598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 rep = unicode_encode_call_errorhandler(
7600 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007601 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007602 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 if (rep == NULL)
7604 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007605 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007606
7607 if (PyBytes_Check(rep)) {
7608 outsize = PyBytes_GET_SIZE(rep);
7609 if (outsize != 1) {
7610 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7611 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7612 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7613 Py_DECREF(rep);
7614 goto error;
7615 }
7616 out = PyBytes_AS_STRING(*outbytes) + offset;
7617 }
7618 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7619 out += outsize;
7620 }
7621 else {
7622 Py_ssize_t i;
7623 enum PyUnicode_Kind kind;
7624 void *data;
7625
Benjamin Petersonbac79492012-01-14 13:34:47 -05007626 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 Py_DECREF(rep);
7628 goto error;
7629 }
7630
7631 outsize = PyUnicode_GET_LENGTH(rep);
7632 if (outsize != 1) {
7633 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7634 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7635 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7636 Py_DECREF(rep);
7637 goto error;
7638 }
7639 out = PyBytes_AS_STRING(*outbytes) + offset;
7640 }
7641 kind = PyUnicode_KIND(rep);
7642 data = PyUnicode_DATA(rep);
7643 for (i=0; i < outsize; i++) {
7644 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7645 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007646 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007647 encoding, unicode,
7648 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 "unable to encode error handler result to ASCII");
7650 Py_DECREF(rep);
7651 goto error;
7652 }
7653 *out = (unsigned char)ch;
7654 out++;
7655 }
7656 }
7657 Py_DECREF(rep);
7658 }
7659 /* write a NUL byte */
7660 *out = 0;
7661 outsize = out - PyBytes_AS_STRING(*outbytes);
7662 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7663 if (_PyBytes_Resize(outbytes, outsize) < 0)
7664 goto error;
7665 ret = 0;
7666
7667error:
7668 Py_XDECREF(encoding_obj);
7669 Py_XDECREF(errorHandler);
7670 Py_XDECREF(exc);
7671 return ret;
7672}
7673
Victor Stinner3a50e702011-10-18 21:21:00 +02007674static PyObject *
7675encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007676 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007677 const char *errors)
7678{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007679 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007681 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007682 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007683
Victor Stinner29dacf22015-01-26 16:41:32 +01007684 if (!PyUnicode_Check(unicode)) {
7685 PyErr_BadArgument();
7686 return NULL;
7687 }
7688
Benjamin Petersonbac79492012-01-14 13:34:47 -05007689 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007690 return NULL;
7691 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007692
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 if (code_page < 0) {
7694 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7695 return NULL;
7696 }
7697
Martin v. Löwis3d325192011-11-04 18:23:06 +01007698 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007699 return PyBytes_FromStringAndSize(NULL, 0);
7700
Victor Stinner7581cef2011-11-03 22:32:33 +01007701 offset = 0;
7702 do
7703 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007704#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007705 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007706 chunks. */
7707 if (len > INT_MAX/2) {
7708 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 done = 0;
7710 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007711 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007712#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007713 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007714 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007715 done = 1;
7716 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007717
Victor Stinner76a31a62011-11-04 00:05:13 +01007718 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007719 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007720 errors);
7721 if (ret == -2)
7722 ret = encode_code_page_errors(code_page, &outbytes,
7723 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007724 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007725 if (ret < 0) {
7726 Py_XDECREF(outbytes);
7727 return NULL;
7728 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729
Victor Stinner7581cef2011-11-03 22:32:33 +01007730 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007733
Victor Stinner3a50e702011-10-18 21:21:00 +02007734 return outbytes;
7735}
7736
7737PyObject *
7738PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7739 Py_ssize_t size,
7740 const char *errors)
7741{
Victor Stinner7581cef2011-11-03 22:32:33 +01007742 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007743 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007744 if (unicode == NULL)
7745 return NULL;
7746 res = encode_code_page(CP_ACP, unicode, errors);
7747 Py_DECREF(unicode);
7748 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007749}
7750
7751PyObject *
7752PyUnicode_EncodeCodePage(int code_page,
7753 PyObject *unicode,
7754 const char *errors)
7755{
Victor Stinner7581cef2011-11-03 22:32:33 +01007756 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007757}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007758
Alexander Belopolsky40018472011-02-26 01:02:56 +00007759PyObject *
7760PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007761{
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007763}
7764
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007765#undef NEED_RETRY
7766
Steve Dowercc16be82016-09-08 10:35:16 -07007767#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007768
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769/* --- Character Mapping Codec -------------------------------------------- */
7770
Victor Stinnerfb161b12013-04-18 01:44:27 +02007771static int
7772charmap_decode_string(const char *s,
7773 Py_ssize_t size,
7774 PyObject *mapping,
7775 const char *errors,
7776 _PyUnicodeWriter *writer)
7777{
7778 const char *starts = s;
7779 const char *e;
7780 Py_ssize_t startinpos, endinpos;
7781 PyObject *errorHandler = NULL, *exc = NULL;
7782 Py_ssize_t maplen;
7783 enum PyUnicode_Kind mapkind;
7784 void *mapdata;
7785 Py_UCS4 x;
7786 unsigned char ch;
7787
7788 if (PyUnicode_READY(mapping) == -1)
7789 return -1;
7790
7791 maplen = PyUnicode_GET_LENGTH(mapping);
7792 mapdata = PyUnicode_DATA(mapping);
7793 mapkind = PyUnicode_KIND(mapping);
7794
7795 e = s + size;
7796
7797 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7798 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7799 * is disabled in encoding aliases, latin1 is preferred because
7800 * its implementation is faster. */
7801 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7802 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7803 Py_UCS4 maxchar = writer->maxchar;
7804
7805 assert (writer->kind == PyUnicode_1BYTE_KIND);
7806 while (s < e) {
7807 ch = *s;
7808 x = mapdata_ucs1[ch];
7809 if (x > maxchar) {
7810 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7811 goto onError;
7812 maxchar = writer->maxchar;
7813 outdata = (Py_UCS1 *)writer->data;
7814 }
7815 outdata[writer->pos] = x;
7816 writer->pos++;
7817 ++s;
7818 }
7819 return 0;
7820 }
7821
7822 while (s < e) {
7823 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7824 enum PyUnicode_Kind outkind = writer->kind;
7825 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7826 if (outkind == PyUnicode_1BYTE_KIND) {
7827 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7828 Py_UCS4 maxchar = writer->maxchar;
7829 while (s < e) {
7830 ch = *s;
7831 x = mapdata_ucs2[ch];
7832 if (x > maxchar)
7833 goto Error;
7834 outdata[writer->pos] = x;
7835 writer->pos++;
7836 ++s;
7837 }
7838 break;
7839 }
7840 else if (outkind == PyUnicode_2BYTE_KIND) {
7841 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7842 while (s < e) {
7843 ch = *s;
7844 x = mapdata_ucs2[ch];
7845 if (x == 0xFFFE)
7846 goto Error;
7847 outdata[writer->pos] = x;
7848 writer->pos++;
7849 ++s;
7850 }
7851 break;
7852 }
7853 }
7854 ch = *s;
7855
7856 if (ch < maplen)
7857 x = PyUnicode_READ(mapkind, mapdata, ch);
7858 else
7859 x = 0xfffe; /* invalid value */
7860Error:
7861 if (x == 0xfffe)
7862 {
7863 /* undefined mapping */
7864 startinpos = s-starts;
7865 endinpos = startinpos+1;
7866 if (unicode_decode_call_errorhandler_writer(
7867 errors, &errorHandler,
7868 "charmap", "character maps to <undefined>",
7869 &starts, &e, &startinpos, &endinpos, &exc, &s,
7870 writer)) {
7871 goto onError;
7872 }
7873 continue;
7874 }
7875
7876 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7877 goto onError;
7878 ++s;
7879 }
7880 Py_XDECREF(errorHandler);
7881 Py_XDECREF(exc);
7882 return 0;
7883
7884onError:
7885 Py_XDECREF(errorHandler);
7886 Py_XDECREF(exc);
7887 return -1;
7888}
7889
7890static int
7891charmap_decode_mapping(const char *s,
7892 Py_ssize_t size,
7893 PyObject *mapping,
7894 const char *errors,
7895 _PyUnicodeWriter *writer)
7896{
7897 const char *starts = s;
7898 const char *e;
7899 Py_ssize_t startinpos, endinpos;
7900 PyObject *errorHandler = NULL, *exc = NULL;
7901 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007902 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007903
7904 e = s + size;
7905
7906 while (s < e) {
7907 ch = *s;
7908
7909 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7910 key = PyLong_FromLong((long)ch);
7911 if (key == NULL)
7912 goto onError;
7913
7914 item = PyObject_GetItem(mapping, key);
7915 Py_DECREF(key);
7916 if (item == NULL) {
7917 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7918 /* No mapping found means: mapping is undefined. */
7919 PyErr_Clear();
7920 goto Undefined;
7921 } else
7922 goto onError;
7923 }
7924
7925 /* Apply mapping */
7926 if (item == Py_None)
7927 goto Undefined;
7928 if (PyLong_Check(item)) {
7929 long value = PyLong_AS_LONG(item);
7930 if (value == 0xFFFE)
7931 goto Undefined;
7932 if (value < 0 || value > MAX_UNICODE) {
7933 PyErr_Format(PyExc_TypeError,
7934 "character mapping must be in range(0x%lx)",
7935 (unsigned long)MAX_UNICODE + 1);
7936 goto onError;
7937 }
7938
7939 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7940 goto onError;
7941 }
7942 else if (PyUnicode_Check(item)) {
7943 if (PyUnicode_READY(item) == -1)
7944 goto onError;
7945 if (PyUnicode_GET_LENGTH(item) == 1) {
7946 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7947 if (value == 0xFFFE)
7948 goto Undefined;
7949 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7950 goto onError;
7951 }
7952 else {
7953 writer->overallocate = 1;
7954 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7955 goto onError;
7956 }
7957 }
7958 else {
7959 /* wrong return value */
7960 PyErr_SetString(PyExc_TypeError,
7961 "character mapping must return integer, None or str");
7962 goto onError;
7963 }
7964 Py_CLEAR(item);
7965 ++s;
7966 continue;
7967
7968Undefined:
7969 /* undefined mapping */
7970 Py_CLEAR(item);
7971 startinpos = s-starts;
7972 endinpos = startinpos+1;
7973 if (unicode_decode_call_errorhandler_writer(
7974 errors, &errorHandler,
7975 "charmap", "character maps to <undefined>",
7976 &starts, &e, &startinpos, &endinpos, &exc, &s,
7977 writer)) {
7978 goto onError;
7979 }
7980 }
7981 Py_XDECREF(errorHandler);
7982 Py_XDECREF(exc);
7983 return 0;
7984
7985onError:
7986 Py_XDECREF(item);
7987 Py_XDECREF(errorHandler);
7988 Py_XDECREF(exc);
7989 return -1;
7990}
7991
Alexander Belopolsky40018472011-02-26 01:02:56 +00007992PyObject *
7993PyUnicode_DecodeCharmap(const char *s,
7994 Py_ssize_t size,
7995 PyObject *mapping,
7996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007998 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007999
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 /* Default to Latin-1 */
8001 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008005 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008006 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008007 writer.min_length = size;
8008 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008010
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008011 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008012 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8013 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008014 }
8015 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008016 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008019 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008020
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008022 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 return NULL;
8024}
8025
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026/* Charmap encoding: the lookup table */
8027
Alexander Belopolsky40018472011-02-26 01:02:56 +00008028struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 PyObject_HEAD
8030 unsigned char level1[32];
8031 int count2, count3;
8032 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033};
8034
8035static PyObject*
8036encoding_map_size(PyObject *obj, PyObject* args)
8037{
8038 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008039 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041}
8042
8043static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 PyDoc_STR("Return the size (in bytes) of this object") },
8046 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047};
8048
8049static void
8050encoding_map_dealloc(PyObject* o)
8051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008052 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053}
8054
8055static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 "EncodingMap", /*tp_name*/
8058 sizeof(struct encoding_map), /*tp_basicsize*/
8059 0, /*tp_itemsize*/
8060 /* methods */
8061 encoding_map_dealloc, /*tp_dealloc*/
8062 0, /*tp_print*/
8063 0, /*tp_getattr*/
8064 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008065 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 0, /*tp_repr*/
8067 0, /*tp_as_number*/
8068 0, /*tp_as_sequence*/
8069 0, /*tp_as_mapping*/
8070 0, /*tp_hash*/
8071 0, /*tp_call*/
8072 0, /*tp_str*/
8073 0, /*tp_getattro*/
8074 0, /*tp_setattro*/
8075 0, /*tp_as_buffer*/
8076 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8077 0, /*tp_doc*/
8078 0, /*tp_traverse*/
8079 0, /*tp_clear*/
8080 0, /*tp_richcompare*/
8081 0, /*tp_weaklistoffset*/
8082 0, /*tp_iter*/
8083 0, /*tp_iternext*/
8084 encoding_map_methods, /*tp_methods*/
8085 0, /*tp_members*/
8086 0, /*tp_getset*/
8087 0, /*tp_base*/
8088 0, /*tp_dict*/
8089 0, /*tp_descr_get*/
8090 0, /*tp_descr_set*/
8091 0, /*tp_dictoffset*/
8092 0, /*tp_init*/
8093 0, /*tp_alloc*/
8094 0, /*tp_new*/
8095 0, /*tp_free*/
8096 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097};
8098
8099PyObject*
8100PyUnicode_BuildEncodingMap(PyObject* string)
8101{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102 PyObject *result;
8103 struct encoding_map *mresult;
8104 int i;
8105 int need_dict = 0;
8106 unsigned char level1[32];
8107 unsigned char level2[512];
8108 unsigned char *mlevel1, *mlevel2, *mlevel3;
8109 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 int kind;
8111 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008112 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008115 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 PyErr_BadArgument();
8117 return NULL;
8118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 kind = PyUnicode_KIND(string);
8120 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008121 length = PyUnicode_GET_LENGTH(string);
8122 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008123 memset(level1, 0xFF, sizeof level1);
8124 memset(level2, 0xFF, sizeof level2);
8125
8126 /* If there isn't a one-to-one mapping of NULL to \0,
8127 or if there are non-BMP characters, we need to use
8128 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008129 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008130 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008131 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008133 ch = PyUnicode_READ(kind, data, i);
8134 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 need_dict = 1;
8136 break;
8137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139 /* unmapped character */
8140 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 l1 = ch >> 11;
8142 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 if (level1[l1] == 0xFF)
8144 level1[l1] = count2++;
8145 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008146 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 }
8148
8149 if (count2 >= 0xFF || count3 >= 0xFF)
8150 need_dict = 1;
8151
8152 if (need_dict) {
8153 PyObject *result = PyDict_New();
8154 PyObject *key, *value;
8155 if (!result)
8156 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008157 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008159 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 if (!key || !value)
8161 goto failed1;
8162 if (PyDict_SetItem(result, key, value) == -1)
8163 goto failed1;
8164 Py_DECREF(key);
8165 Py_DECREF(value);
8166 }
8167 return result;
8168 failed1:
8169 Py_XDECREF(key);
8170 Py_XDECREF(value);
8171 Py_DECREF(result);
8172 return NULL;
8173 }
8174
8175 /* Create a three-level trie */
8176 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8177 16*count2 + 128*count3 - 1);
8178 if (!result)
8179 return PyErr_NoMemory();
8180 PyObject_Init(result, &EncodingMapType);
8181 mresult = (struct encoding_map*)result;
8182 mresult->count2 = count2;
8183 mresult->count3 = count3;
8184 mlevel1 = mresult->level1;
8185 mlevel2 = mresult->level23;
8186 mlevel3 = mresult->level23 + 16*count2;
8187 memcpy(mlevel1, level1, 32);
8188 memset(mlevel2, 0xFF, 16*count2);
8189 memset(mlevel3, 0, 128*count3);
8190 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008191 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008192 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008193 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8194 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008195 /* unmapped character */
8196 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008197 o1 = ch>>11;
8198 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008199 i2 = 16*mlevel1[o1] + o2;
8200 if (mlevel2[i2] == 0xFF)
8201 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008202 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008203 i3 = 128*mlevel2[i2] + o3;
8204 mlevel3[i3] = i;
8205 }
8206 return result;
8207}
8208
8209static int
Victor Stinner22168992011-11-20 17:09:18 +01008210encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211{
8212 struct encoding_map *map = (struct encoding_map*)mapping;
8213 int l1 = c>>11;
8214 int l2 = (c>>7) & 0xF;
8215 int l3 = c & 0x7F;
8216 int i;
8217
Victor Stinner22168992011-11-20 17:09:18 +01008218 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220 if (c == 0)
8221 return 0;
8222 /* level 1*/
8223 i = map->level1[l1];
8224 if (i == 0xFF) {
8225 return -1;
8226 }
8227 /* level 2*/
8228 i = map->level23[16*i+l2];
8229 if (i == 0xFF) {
8230 return -1;
8231 }
8232 /* level 3 */
8233 i = map->level23[16*map->count2 + 128*i + l3];
8234 if (i == 0) {
8235 return -1;
8236 }
8237 return i;
8238}
8239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240/* Lookup the character ch in the mapping. If the character
8241 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008242 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008243static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008244charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245{
Christian Heimes217cfd12007-12-02 14:31:20 +00008246 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 PyObject *x;
8248
8249 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 x = PyObject_GetItem(mapping, w);
8252 Py_DECREF(w);
8253 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8255 /* No mapping found means: mapping is undefined. */
8256 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008257 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 } else
8259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008261 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008263 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 long value = PyLong_AS_LONG(x);
8265 if (value < 0 || value > 255) {
8266 PyErr_SetString(PyExc_TypeError,
8267 "character mapping must be in range(256)");
8268 Py_DECREF(x);
8269 return NULL;
8270 }
8271 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008273 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 /* wrong return value */
8277 PyErr_Format(PyExc_TypeError,
8278 "character mapping must return integer, bytes or None, not %.400s",
8279 x->ob_type->tp_name);
8280 Py_DECREF(x);
8281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 }
8283}
8284
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008285static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008286charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008288 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8289 /* exponentially overallocate to minimize reallocations */
8290 if (requiredsize < 2*outsize)
8291 requiredsize = 2*outsize;
8292 if (_PyBytes_Resize(outobj, requiredsize))
8293 return -1;
8294 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008295}
8296
/* Outcome of encoding one character with a character map */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008301 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 space is available. Return a new reference to the object that
8303 was put in the output buffer, or Py_None, if the mapping was undefined
8304 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008305 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008306static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008307charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008308 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008310 PyObject *rep;
8311 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008312 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313
Christian Heimes90aa7642007-12-19 02:45:37 +00008314 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 if (res == -1)
8318 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 if (outsize<requiredsize)
8320 if (charmapencode_resize(outobj, outpos, requiredsize))
8321 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008322 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 outstart[(*outpos)++] = (char)res;
8324 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008325 }
8326
8327 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 Py_DECREF(rep);
8332 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 if (PyLong_Check(rep)) {
8335 Py_ssize_t requiredsize = *outpos+1;
8336 if (outsize<requiredsize)
8337 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8338 Py_DECREF(rep);
8339 return enc_EXCEPTION;
8340 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008341 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 else {
8345 const char *repchars = PyBytes_AS_STRING(rep);
8346 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8347 Py_ssize_t requiredsize = *outpos+repsize;
8348 if (outsize<requiredsize)
8349 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8350 Py_DECREF(rep);
8351 return enc_EXCEPTION;
8352 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008353 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 memcpy(outstart + *outpos, repchars, repsize);
8355 *outpos += repsize;
8356 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 Py_DECREF(rep);
8359 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360}
8361
8362/* handle an error in PyUnicode_EncodeCharmap
8363 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008364static int
8365charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008366 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008368 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008369 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370{
8371 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008372 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008374 enum PyUnicode_Kind kind;
8375 void *data;
8376 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008378 Py_ssize_t collstartpos = *inpos;
8379 Py_ssize_t collendpos = *inpos+1;
8380 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 char *encoding = "charmap";
8382 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008384 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008385 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386
Benjamin Petersonbac79492012-01-14 13:34:47 -05008387 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008388 return -1;
8389 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 /* find all unencodable characters */
8391 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008392 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008393 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008394 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008395 val = encoding_map_lookup(ch, mapping);
8396 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 break;
8398 ++collendpos;
8399 continue;
8400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8403 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 if (rep==NULL)
8405 return -1;
8406 else if (rep!=Py_None) {
8407 Py_DECREF(rep);
8408 break;
8409 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 }
8413 /* cache callback name lookup
8414 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008415 if (*error_handler == _Py_ERROR_UNKNOWN)
8416 *error_handler = get_error_handler(errors);
8417
8418 switch (*error_handler) {
8419 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008420 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008422
8423 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 x = charmapencode_output('?', mapping, res, respos);
8426 if (x==enc_EXCEPTION) {
8427 return -1;
8428 }
8429 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008430 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 return -1;
8432 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 }
8434 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008435 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008436 *inpos = collendpos;
8437 break;
Victor Stinner50149202015-09-22 00:26:54 +02008438
8439 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008440 /* generate replacement (temporarily (mis)uses p) */
8441 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 char buffer[2+29+1+1];
8443 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008444 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 for (cp = buffer; *cp; ++cp) {
8446 x = charmapencode_output(*cp, mapping, res, respos);
8447 if (x==enc_EXCEPTION)
8448 return -1;
8449 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008450 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return -1;
8452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 }
8454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 *inpos = collendpos;
8456 break;
Victor Stinner50149202015-09-22 00:26:54 +02008457
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 default:
Victor Stinner50149202015-09-22 00:26:54 +02008459 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008460 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008464 if (PyBytes_Check(repunicode)) {
8465 /* Directly copy bytes result to output. */
8466 Py_ssize_t outsize = PyBytes_Size(*res);
8467 Py_ssize_t requiredsize;
8468 repsize = PyBytes_Size(repunicode);
8469 requiredsize = *respos + repsize;
8470 if (requiredsize > outsize)
8471 /* Make room for all additional bytes. */
8472 if (charmapencode_resize(res, respos, requiredsize)) {
8473 Py_DECREF(repunicode);
8474 return -1;
8475 }
8476 memcpy(PyBytes_AsString(*res) + *respos,
8477 PyBytes_AsString(repunicode), repsize);
8478 *respos += repsize;
8479 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008480 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008481 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008483 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008484 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008485 Py_DECREF(repunicode);
8486 return -1;
8487 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008488 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008489 data = PyUnicode_DATA(repunicode);
8490 kind = PyUnicode_KIND(repunicode);
8491 for (index = 0; index < repsize; index++) {
8492 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8493 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008495 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 return -1;
8497 }
8498 else if (x==enc_FAILED) {
8499 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008500 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 return -1;
8502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008503 }
8504 *inpos = newpos;
8505 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 }
8507 return 0;
8508}
8509
Alexander Belopolsky40018472011-02-26 01:02:56 +00008510PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008511_PyUnicode_EncodeCharmap(PyObject *unicode,
8512 PyObject *mapping,
8513 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 /* output object */
8516 PyObject *res = NULL;
8517 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008518 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008519 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008521 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008522 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008524 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008525 void *data;
8526 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527
Benjamin Petersonbac79492012-01-14 13:34:47 -05008528 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008529 return NULL;
8530 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008531 data = PyUnicode_DATA(unicode);
8532 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008533
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 /* Default to Latin-1 */
8535 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 /* allocate enough for a simple encoding without
8539 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008540 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 if (res == NULL)
8542 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008543 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008547 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008549 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 if (x==enc_EXCEPTION) /* error */
8551 goto onError;
8552 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008553 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008555 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 &res, &respos)) {
8557 goto onError;
8558 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008559 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 else
8561 /* done with this character => adjust input position */
8562 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008566 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008567 if (_PyBytes_Resize(&res, respos) < 0)
8568 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008571 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 return res;
8573
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 Py_XDECREF(res);
8576 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008577 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 return NULL;
8579}
8580
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008581/* Deprecated */
8582PyObject *
8583PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8584 Py_ssize_t size,
8585 PyObject *mapping,
8586 const char *errors)
8587{
8588 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008589 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008590 if (unicode == NULL)
8591 return NULL;
8592 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8593 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008594 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008595}
8596
Alexander Belopolsky40018472011-02-26 01:02:56 +00008597PyObject *
8598PyUnicode_AsCharmapString(PyObject *unicode,
8599 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600{
8601 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 PyErr_BadArgument();
8603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008605 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606}
8607
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008609static void
8610make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008612 Py_ssize_t startpos, Py_ssize_t endpos,
8613 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008615 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 *exceptionObject = _PyUnicodeTranslateError_Create(
8617 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 }
8619 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8621 goto onError;
8622 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8623 goto onError;
8624 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8625 goto onError;
8626 return;
8627 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008628 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 }
8630}
8631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632/* error handling callback helper:
8633 build arguments, call the callback and check the arguments,
8634 put the result into newpos and return the replacement string, which
8635 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008636static PyObject *
8637unicode_translate_call_errorhandler(const char *errors,
8638 PyObject **errorHandler,
8639 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008641 Py_ssize_t startpos, Py_ssize_t endpos,
8642 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008644 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008646 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 PyObject *restuple;
8648 PyObject *resunicode;
8649
8650 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654 }
8655
8656 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008661 restuple = PyObject_CallFunctionObjArgs(
8662 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008666 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 Py_DECREF(restuple);
8668 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008670 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 &resunicode, &i_newpos)) {
8672 Py_DECREF(restuple);
8673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008675 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008677 else
8678 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008680 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 Py_DECREF(restuple);
8682 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008683 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 Py_INCREF(resunicode);
8685 Py_DECREF(restuple);
8686 return resunicode;
8687}
8688
8689/* Lookup the character ch in the mapping and put the result in result,
8690 which must be decrefed by the caller.
8691 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008692static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694{
Christian Heimes217cfd12007-12-02 14:31:20 +00008695 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 PyObject *x;
8697
8698 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 x = PyObject_GetItem(mapping, w);
8701 Py_DECREF(w);
8702 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8704 /* No mapping found means: use 1:1 mapping. */
8705 PyErr_Clear();
8706 *result = NULL;
8707 return 0;
8708 } else
8709 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 }
8711 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 *result = x;
8713 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008715 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008717 if (value < 0 || value > MAX_UNICODE) {
8718 PyErr_Format(PyExc_ValueError,
8719 "character mapping must be in range(0x%x)",
8720 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 Py_DECREF(x);
8722 return -1;
8723 }
8724 *result = x;
8725 return 0;
8726 }
8727 else if (PyUnicode_Check(x)) {
8728 *result = x;
8729 return 0;
8730 }
8731 else {
8732 /* wrong return value */
8733 PyErr_SetString(PyExc_TypeError,
8734 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008735 Py_DECREF(x);
8736 return -1;
8737 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738}
Victor Stinner1194ea02014-04-04 19:37:40 +02008739
8740/* lookup the character, write the result into the writer.
8741 Return 1 if the result was written into the writer, return 0 if the mapping
8742 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008743static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008744charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8745 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746{
Victor Stinner1194ea02014-04-04 19:37:40 +02008747 PyObject *item;
8748
8749 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008751
8752 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008754 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008757 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008759
8760 if (item == Py_None) {
8761 Py_DECREF(item);
8762 return 0;
8763 }
8764
8765 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008766 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8767 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8768 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008769 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8770 Py_DECREF(item);
8771 return -1;
8772 }
8773 Py_DECREF(item);
8774 return 1;
8775 }
8776
8777 if (!PyUnicode_Check(item)) {
8778 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008780 }
8781
8782 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8783 Py_DECREF(item);
8784 return -1;
8785 }
8786
8787 Py_DECREF(item);
8788 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789}
8790
Victor Stinner89a76ab2014-04-05 11:44:04 +02008791static int
8792unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8793 Py_UCS1 *translate)
8794{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008795 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008796 int ret = 0;
8797
Victor Stinner89a76ab2014-04-05 11:44:04 +02008798 if (charmaptranslate_lookup(ch, mapping, &item)) {
8799 return -1;
8800 }
8801
8802 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008803 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008804 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008805 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008806 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008807 /* not found => default to 1:1 mapping */
8808 translate[ch] = ch;
8809 return 1;
8810 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008811 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008812 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008813 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8814 used it */
8815 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 /* invalid character or character outside ASCII:
8817 skip the fast translate */
8818 goto exit;
8819 }
8820 translate[ch] = (Py_UCS1)replace;
8821 }
8822 else if (PyUnicode_Check(item)) {
8823 Py_UCS4 replace;
8824
8825 if (PyUnicode_READY(item) == -1) {
8826 Py_DECREF(item);
8827 return -1;
8828 }
8829 if (PyUnicode_GET_LENGTH(item) != 1)
8830 goto exit;
8831
8832 replace = PyUnicode_READ_CHAR(item, 0);
8833 if (replace > 127)
8834 goto exit;
8835 translate[ch] = (Py_UCS1)replace;
8836 }
8837 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008838 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839 goto exit;
8840 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008841 ret = 1;
8842
Benjamin Peterson1365de72014-04-07 20:15:41 -04008843 exit:
8844 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008845 return ret;
8846}
8847
8848/* Fast path for ascii => ascii translation. Return 1 if the whole string
8849 was translated into writer, return 0 if the input string was partially
8850 translated into writer, raise an exception and return -1 on error. */
8851static int
8852unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008853 _PyUnicodeWriter *writer, int ignore,
8854 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008855{
Victor Stinner872b2912014-04-05 14:27:07 +02008856 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857 Py_ssize_t len;
8858 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008859 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861 len = PyUnicode_GET_LENGTH(input);
8862
Victor Stinner872b2912014-04-05 14:27:07 +02008863 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008864
8865 in = PyUnicode_1BYTE_DATA(input);
8866 end = in + len;
8867
8868 assert(PyUnicode_IS_ASCII(writer->buffer));
8869 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8870 out = PyUnicode_1BYTE_DATA(writer->buffer);
8871
Victor Stinner872b2912014-04-05 14:27:07 +02008872 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008873 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008874 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008876 int translate = unicode_fast_translate_lookup(mapping, ch,
8877 ascii_table);
8878 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008880 if (translate == 0)
8881 goto exit;
8882 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883 }
Victor Stinner872b2912014-04-05 14:27:07 +02008884 if (ch2 == 0xfe) {
8885 if (ignore)
8886 continue;
8887 goto exit;
8888 }
8889 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008890 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008891 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 }
Victor Stinner872b2912014-04-05 14:27:07 +02008893 res = 1;
8894
8895exit:
8896 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008897 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008898 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008899}
8900
Victor Stinner3222da22015-10-01 22:07:32 +02008901static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902_PyUnicode_TranslateCharmap(PyObject *input,
8903 PyObject *mapping,
8904 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008907 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 Py_ssize_t size, i;
8909 int kind;
8910 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008911 _PyUnicodeWriter writer;
8912 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008913 char *reason = "character maps to <undefined>";
8914 PyObject *errorHandler = NULL;
8915 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008916 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008918
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 PyErr_BadArgument();
8921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 if (PyUnicode_READY(input) == -1)
8925 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008926 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 kind = PyUnicode_KIND(input);
8928 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008930 if (size == 0)
8931 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008933 /* allocate enough for a simple 1:1 translation without
8934 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008935 _PyUnicodeWriter_Init(&writer);
8936 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938
Victor Stinner872b2912014-04-05 14:27:07 +02008939 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8940
Victor Stinner33798672016-03-01 21:59:58 +01008941 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008942 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008943 if (PyUnicode_IS_ASCII(input)) {
8944 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8945 if (res < 0) {
8946 _PyUnicodeWriter_Dealloc(&writer);
8947 return NULL;
8948 }
8949 if (res == 1)
8950 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008951 }
Victor Stinner33798672016-03-01 21:59:58 +01008952 else {
8953 i = 0;
8954 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 int translate;
8959 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8960 Py_ssize_t newpos;
8961 /* startpos for collecting untranslatable chars */
8962 Py_ssize_t collstart;
8963 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965
Victor Stinner1194ea02014-04-04 19:37:40 +02008966 ch = PyUnicode_READ(kind, data, i);
8967 translate = charmaptranslate_output(ch, mapping, &writer);
8968 if (translate < 0)
8969 goto onError;
8970
8971 if (translate != 0) {
8972 /* it worked => adjust input pointer */
8973 ++i;
8974 continue;
8975 }
8976
8977 /* untranslatable character */
8978 collstart = i;
8979 collend = i+1;
8980
8981 /* find all untranslatable characters */
8982 while (collend < size) {
8983 PyObject *x;
8984 ch = PyUnicode_READ(kind, data, collend);
8985 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008986 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008987 Py_XDECREF(x);
8988 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008990 ++collend;
8991 }
8992
8993 if (ignore) {
8994 i = collend;
8995 }
8996 else {
8997 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8998 reason, input, &exc,
8999 collstart, collend, &newpos);
9000 if (repunicode == NULL)
9001 goto onError;
9002 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009004 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009005 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009006 Py_DECREF(repunicode);
9007 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009008 }
9009 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009010 Py_XDECREF(exc);
9011 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009015 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009016 Py_XDECREF(exc);
9017 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 return NULL;
9019}
9020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021/* Deprecated. Use PyUnicode_Translate instead. */
9022PyObject *
9023PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9024 Py_ssize_t size,
9025 PyObject *mapping,
9026 const char *errors)
9027{
Christian Heimes5f520f42012-09-11 14:03:25 +02009028 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009029 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 if (!unicode)
9031 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009032 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9033 Py_DECREF(unicode);
9034 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035}
9036
Alexander Belopolsky40018472011-02-26 01:02:56 +00009037PyObject *
9038PyUnicode_Translate(PyObject *str,
9039 PyObject *mapping,
9040 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009042 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009043 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009044 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045}
Tim Petersced69f82003-09-16 20:30:58 +00009046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009048fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049{
9050 /* No need to call PyUnicode_READY(self) because this function is only
9051 called as a callback from fixup() which does it already. */
9052 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9053 const int kind = PyUnicode_KIND(self);
9054 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009055 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009056 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 Py_ssize_t i;
9058
9059 for (i = 0; i < len; ++i) {
9060 ch = PyUnicode_READ(kind, data, i);
9061 fixed = 0;
9062 if (ch > 127) {
9063 if (Py_UNICODE_ISSPACE(ch))
9064 fixed = ' ';
9065 else {
9066 const int decimal = Py_UNICODE_TODECIMAL(ch);
9067 if (decimal >= 0)
9068 fixed = '0' + decimal;
9069 }
9070 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009071 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009072 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 PyUnicode_WRITE(kind, data, i, fixed);
9074 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009075 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009076 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 }
9079
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009080 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081}
9082
9083PyObject *
9084_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9085{
9086 if (!PyUnicode_Check(unicode)) {
9087 PyErr_BadInternalCall();
9088 return NULL;
9089 }
9090 if (PyUnicode_READY(unicode) == -1)
9091 return NULL;
9092 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9093 /* If the string is already ASCII, just return the same string */
9094 Py_INCREF(unicode);
9095 return unicode;
9096 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009097 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098}
9099
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009100PyObject *
9101PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9102 Py_ssize_t length)
9103{
Victor Stinnerf0124502011-11-21 23:12:56 +01009104 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009105 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009106 Py_UCS4 maxchar;
9107 enum PyUnicode_Kind kind;
9108 void *data;
9109
Victor Stinner99d7ad02012-02-22 13:37:39 +01009110 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009111 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009112 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009113 if (ch > 127) {
9114 int decimal = Py_UNICODE_TODECIMAL(ch);
9115 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009116 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009117 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009118 }
9119 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009120
9121 /* Copy to a new string */
9122 decimal = PyUnicode_New(length, maxchar);
9123 if (decimal == NULL)
9124 return decimal;
9125 kind = PyUnicode_KIND(decimal);
9126 data = PyUnicode_DATA(decimal);
9127 /* Iterate over code points */
9128 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009129 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009130 if (ch > 127) {
9131 int decimal = Py_UNICODE_TODECIMAL(ch);
9132 if (decimal >= 0)
9133 ch = '0' + decimal;
9134 }
9135 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009137 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009138}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009139/* --- Decimal Encoder ---------------------------------------------------- */
9140
Alexander Belopolsky40018472011-02-26 01:02:56 +00009141int
9142PyUnicode_EncodeDecimal(Py_UNICODE *s,
9143 Py_ssize_t length,
9144 char *output,
9145 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009146{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009147 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009148 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009149 enum PyUnicode_Kind kind;
9150 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009151
9152 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 PyErr_BadArgument();
9154 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009155 }
9156
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009157 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009158 if (unicode == NULL)
9159 return -1;
9160
Victor Stinner42bf7752011-11-21 22:52:58 +01009161 kind = PyUnicode_KIND(unicode);
9162 data = PyUnicode_DATA(unicode);
9163
Victor Stinnerb84d7232011-11-22 01:50:07 +01009164 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009165 PyObject *exc;
9166 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009168 Py_ssize_t startpos;
9169
9170 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009171
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009173 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009174 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009176 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 decimal = Py_UNICODE_TODECIMAL(ch);
9178 if (decimal >= 0) {
9179 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009180 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 continue;
9182 }
9183 if (0 < ch && ch < 256) {
9184 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009185 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 continue;
9187 }
Victor Stinner6345be92011-11-25 20:09:01 +01009188
Victor Stinner42bf7752011-11-21 22:52:58 +01009189 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009190 exc = NULL;
9191 raise_encode_exception(&exc, "decimal", unicode,
9192 startpos, startpos+1,
9193 "invalid decimal Unicode string");
9194 Py_XDECREF(exc);
9195 Py_DECREF(unicode);
9196 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009197 }
9198 /* 0-terminate the output string */
9199 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009200 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009201 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009202}
9203
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204/* --- Helpers ------------------------------------------------------------ */
9205
/* Helper macro to fix up start/end slice values against a length len:
   end is clamped to len, negative start/end are taken relative to len,
   and values that are still negative afterwards become 0.

   start and end must be modifiable lvalues; every argument may be
   evaluated more than once.  The body is wrapped in do { } while (0) so
   that the expansion is a single statement and can be used safely in an
   unbraced if/else; invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009222any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009224 Py_ssize_t end,
9225 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009227 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 void *buf1, *buf2;
9229 Py_ssize_t len1, len2, result;
9230
9231 kind1 = PyUnicode_KIND(s1);
9232 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009233 if (kind1 < kind2)
9234 return -1;
9235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 len1 = PyUnicode_GET_LENGTH(s1);
9237 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009238 ADJUST_INDICES(start, end, len1);
9239 if (end - start < len2)
9240 return -1;
9241
9242 buf1 = PyUnicode_DATA(s1);
9243 buf2 = PyUnicode_DATA(s2);
9244 if (len2 == 1) {
9245 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9246 result = findchar((const char *)buf1 + kind1*start,
9247 kind1, end - start, ch, direction);
9248 if (result == -1)
9249 return -1;
9250 else
9251 return start + result;
9252 }
9253
9254 if (kind2 != kind1) {
9255 buf2 = _PyUnicode_AsKind(s2, kind1);
9256 if (!buf2)
9257 return -2;
9258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259
Victor Stinner794d5672011-10-10 03:21:36 +02009260 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009261 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009262 case PyUnicode_1BYTE_KIND:
9263 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9264 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9265 else
9266 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9267 break;
9268 case PyUnicode_2BYTE_KIND:
9269 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9270 break;
9271 case PyUnicode_4BYTE_KIND:
9272 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9273 break;
9274 default:
9275 assert(0); result = -2;
9276 }
9277 }
9278 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
9293 assert(0); result = -2;
9294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 }
9296
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 PyMem_Free(buf2);
9299
9300 return result;
9301}
9302
9303Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009304_PyUnicode_InsertThousandsGrouping(
9305 PyObject *unicode, Py_ssize_t index,
9306 Py_ssize_t n_buffer,
9307 void *digits, Py_ssize_t n_digits,
9308 Py_ssize_t min_width,
9309 const char *grouping, PyObject *thousands_sep,
9310 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311{
Victor Stinner41a863c2012-02-24 00:37:51 +01009312 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009313 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009314 Py_ssize_t thousands_sep_len;
9315 Py_ssize_t len;
9316
9317 if (unicode != NULL) {
9318 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009319 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009320 }
9321 else {
9322 kind = PyUnicode_1BYTE_KIND;
9323 data = NULL;
9324 }
9325 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9326 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9327 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9328 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009329 if (thousands_sep_kind < kind) {
9330 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9331 if (!thousands_sep_data)
9332 return -1;
9333 }
9334 else {
9335 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9336 if (!data)
9337 return -1;
9338 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009339 }
9340
Benjamin Petersonead6b532011-12-20 17:23:42 -06009341 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009343 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009344 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009345 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009346 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009347 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009348 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009349 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009350 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009351 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009352 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009353 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009355 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009356 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009358 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009359 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009361 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009362 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009363 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009364 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009365 break;
9366 default:
9367 assert(0);
9368 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009370 if (unicode != NULL && thousands_sep_kind != kind) {
9371 if (thousands_sep_kind < kind)
9372 PyMem_Free(thousands_sep_data);
9373 else
9374 PyMem_Free(data);
9375 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 if (unicode == NULL) {
9377 *maxchar = 127;
9378 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009379 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009380 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 }
9382 }
9383 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384}
9385
9386
Alexander Belopolsky40018472011-02-26 01:02:56 +00009387Py_ssize_t
9388PyUnicode_Count(PyObject *str,
9389 PyObject *substr,
9390 Py_ssize_t start,
9391 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009393 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009394 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 void *buf1 = NULL, *buf2 = NULL;
9396 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009397
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009398 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009400
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009401 kind1 = PyUnicode_KIND(str);
9402 kind2 = PyUnicode_KIND(substr);
9403 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009404 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009405
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009406 len1 = PyUnicode_GET_LENGTH(str);
9407 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009409 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009410 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009412 buf1 = PyUnicode_DATA(str);
9413 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009414 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009416 if (!buf2)
9417 goto onError;
9418 }
9419
9420 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009422 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009423 result = asciilib_count(
9424 ((Py_UCS1*)buf1) + start, end - start,
9425 buf2, len2, PY_SSIZE_T_MAX
9426 );
9427 else
9428 result = ucs1lib_count(
9429 ((Py_UCS1*)buf1) + start, end - start,
9430 buf2, len2, PY_SSIZE_T_MAX
9431 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 break;
9433 case PyUnicode_2BYTE_KIND:
9434 result = ucs2lib_count(
9435 ((Py_UCS2*)buf1) + start, end - start,
9436 buf2, len2, PY_SSIZE_T_MAX
9437 );
9438 break;
9439 case PyUnicode_4BYTE_KIND:
9440 result = ucs4lib_count(
9441 ((Py_UCS4*)buf1) + start, end - start,
9442 buf2, len2, PY_SSIZE_T_MAX
9443 );
9444 break;
9445 default:
9446 assert(0); result = 0;
9447 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009448
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009449 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 PyMem_Free(buf2);
9451
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009454 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 PyMem_Free(buf2);
9456 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457}
9458
Alexander Belopolsky40018472011-02-26 01:02:56 +00009459Py_ssize_t
9460PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009461 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009462 Py_ssize_t start,
9463 Py_ssize_t end,
9464 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009466 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009468
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009469 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470}
9471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472Py_ssize_t
9473PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9474 Py_ssize_t start, Py_ssize_t end,
9475 int direction)
9476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009478 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 if (PyUnicode_READY(str) == -1)
9480 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009481 len = PyUnicode_GET_LENGTH(str);
9482 ADJUST_INDICES(start, end, len);
9483 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009484 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009486 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9487 kind, end-start, ch, direction);
9488 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009490 else
9491 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492}
9493
Alexander Belopolsky40018472011-02-26 01:02:56 +00009494static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009495tailmatch(PyObject *self,
9496 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009497 Py_ssize_t start,
9498 Py_ssize_t end,
9499 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 int kind_self;
9502 int kind_sub;
9503 void *data_self;
9504 void *data_sub;
9505 Py_ssize_t offset;
9506 Py_ssize_t i;
9507 Py_ssize_t end_sub;
9508
9509 if (PyUnicode_READY(self) == -1 ||
9510 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009511 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9514 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009518 if (PyUnicode_GET_LENGTH(substring) == 0)
9519 return 1;
9520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 kind_self = PyUnicode_KIND(self);
9522 data_self = PyUnicode_DATA(self);
9523 kind_sub = PyUnicode_KIND(substring);
9524 data_sub = PyUnicode_DATA(substring);
9525 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9526
9527 if (direction > 0)
9528 offset = end;
9529 else
9530 offset = start;
9531
9532 if (PyUnicode_READ(kind_self, data_self, offset) ==
9533 PyUnicode_READ(kind_sub, data_sub, 0) &&
9534 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9535 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9536 /* If both are of the same kind, memcmp is sufficient */
9537 if (kind_self == kind_sub) {
9538 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009539 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 data_sub,
9541 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009542 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009544 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 else {
9546 /* We do not need to compare 0 and len(substring)-1 because
9547 the if statement above ensured already that they are equal
9548 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 for (i = 1; i < end_sub; ++i) {
9550 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9551 PyUnicode_READ(kind_sub, data_sub, i))
9552 return 0;
9553 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 }
9557
9558 return 0;
9559}
9560
Alexander Belopolsky40018472011-02-26 01:02:56 +00009561Py_ssize_t
9562PyUnicode_Tailmatch(PyObject *str,
9563 PyObject *substr,
9564 Py_ssize_t start,
9565 Py_ssize_t end,
9566 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009568 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009570
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009571 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572}
9573
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574/* Apply fixfct filter to the Unicode object self and return a
9575 reference to the modified object */
9576
Alexander Belopolsky40018472011-02-26 01:02:56 +00009577static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009578fixup(PyObject *self,
9579 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 PyObject *u;
9582 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009583 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009585 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009588 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 /* fix functions return the new maximum character in a string,
9591 if the kind of the resulting unicode object does not change,
9592 everything is fine. Otherwise we need to change the string kind
9593 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009594 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009595
9596 if (maxchar_new == 0) {
9597 /* no changes */;
9598 if (PyUnicode_CheckExact(self)) {
9599 Py_DECREF(u);
9600 Py_INCREF(self);
9601 return self;
9602 }
9603 else
9604 return u;
9605 }
9606
Victor Stinnere6abb482012-05-02 01:15:40 +02009607 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608
Victor Stinnereaab6042011-12-11 22:22:39 +01009609 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009611
9612 /* In case the maximum character changed, we need to
9613 convert the string to the new category. */
9614 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9615 if (v == NULL) {
9616 Py_DECREF(u);
9617 return NULL;
9618 }
9619 if (maxchar_new > maxchar_old) {
9620 /* If the maxchar increased so that the kind changed, not all
9621 characters are representable anymore and we need to fix the
9622 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009623 _PyUnicode_FastCopyCharacters(v, 0,
9624 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009625 maxchar_old = fixfct(v);
9626 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 }
9628 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009629 _PyUnicode_FastCopyCharacters(v, 0,
9630 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009632 Py_DECREF(u);
9633 assert(_PyUnicode_CheckConsistency(v, 1));
9634 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635}
9636
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637static PyObject *
9638ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009640 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9641 char *resdata, *data = PyUnicode_DATA(self);
9642 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009643
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009644 res = PyUnicode_New(len, 127);
9645 if (res == NULL)
9646 return NULL;
9647 resdata = PyUnicode_DATA(res);
9648 if (lower)
9649 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651 _Py_bytes_upper(resdata, data, len);
9652 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653}
9654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 Py_ssize_t j;
9659 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009660 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009662
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9664
9665 where ! is a negation and \p{xxx} is a character with property xxx.
9666 */
9667 for (j = i - 1; j >= 0; j--) {
9668 c = PyUnicode_READ(kind, data, j);
9669 if (!_PyUnicode_IsCaseIgnorable(c))
9670 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9673 if (final_sigma) {
9674 for (j = i + 1; j < length; j++) {
9675 c = PyUnicode_READ(kind, data, j);
9676 if (!_PyUnicode_IsCaseIgnorable(c))
9677 break;
9678 }
9679 final_sigma = j == length || !_PyUnicode_IsCased(c);
9680 }
9681 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682}
9683
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684static int
9685lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9686 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 /* Obscure special case. */
9689 if (c == 0x3A3) {
9690 mapped[0] = handle_capital_sigma(kind, data, length, i);
9691 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694}
9695
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696static Py_ssize_t
9697do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009699 Py_ssize_t i, k = 0;
9700 int n_res, j;
9701 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009702
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 c = PyUnicode_READ(kind, data, 0);
9704 n_res = _PyUnicode_ToUpperFull(c, mapped);
9705 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009706 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 for (i = 1; i < length; i++) {
9710 c = PyUnicode_READ(kind, data, i);
9711 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9712 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009713 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009715 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009716 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718}
9719
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720static Py_ssize_t
9721do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9722 Py_ssize_t i, k = 0;
9723
9724 for (i = 0; i < length; i++) {
9725 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9726 int n_res, j;
9727 if (Py_UNICODE_ISUPPER(c)) {
9728 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9729 }
9730 else if (Py_UNICODE_ISLOWER(c)) {
9731 n_res = _PyUnicode_ToUpperFull(c, mapped);
9732 }
9733 else {
9734 n_res = 1;
9735 mapped[0] = c;
9736 }
9737 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009738 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 res[k++] = mapped[j];
9740 }
9741 }
9742 return k;
9743}
9744
9745static Py_ssize_t
9746do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9747 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 Py_ssize_t i, k = 0;
9750
9751 for (i = 0; i < length; i++) {
9752 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9753 int n_res, j;
9754 if (lower)
9755 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9756 else
9757 n_res = _PyUnicode_ToUpperFull(c, mapped);
9758 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009759 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009760 res[k++] = mapped[j];
9761 }
9762 }
9763 return k;
9764}
9765
9766static Py_ssize_t
9767do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9768{
9769 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9770}
9771
9772static Py_ssize_t
9773do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9774{
9775 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9776}
9777
Benjamin Petersone51757f2012-01-12 21:10:29 -05009778static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009779do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9780{
9781 Py_ssize_t i, k = 0;
9782
9783 for (i = 0; i < length; i++) {
9784 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9785 Py_UCS4 mapped[3];
9786 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9787 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009788 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009789 res[k++] = mapped[j];
9790 }
9791 }
9792 return k;
9793}
9794
9795static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009796do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 Py_ssize_t i, k = 0;
9799 int previous_is_cased;
9800
9801 previous_is_cased = 0;
9802 for (i = 0; i < length; i++) {
9803 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9804 Py_UCS4 mapped[3];
9805 int n_res, j;
9806
9807 if (previous_is_cased)
9808 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9809 else
9810 n_res = _PyUnicode_ToTitleFull(c, mapped);
9811
9812 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009813 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009814 res[k++] = mapped[j];
9815 }
9816
9817 previous_is_cased = _PyUnicode_IsCased(c);
9818 }
9819 return k;
9820}
9821
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822static PyObject *
9823case_operation(PyObject *self,
9824 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9825{
9826 PyObject *res = NULL;
9827 Py_ssize_t length, newlength = 0;
9828 int kind, outkind;
9829 void *data, *outdata;
9830 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9831
Benjamin Petersoneea48462012-01-16 14:28:50 -05009832 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833
9834 kind = PyUnicode_KIND(self);
9835 data = PyUnicode_DATA(self);
9836 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009837 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009838 PyErr_SetString(PyExc_OverflowError, "string is too long");
9839 return NULL;
9840 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009841 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009842 if (tmp == NULL)
9843 return PyErr_NoMemory();
9844 newlength = perform(kind, data, length, tmp, &maxchar);
9845 res = PyUnicode_New(newlength, maxchar);
9846 if (res == NULL)
9847 goto leave;
9848 tmpend = tmp + newlength;
9849 outdata = PyUnicode_DATA(res);
9850 outkind = PyUnicode_KIND(res);
9851 switch (outkind) {
9852 case PyUnicode_1BYTE_KIND:
9853 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9854 break;
9855 case PyUnicode_2BYTE_KIND:
9856 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9857 break;
9858 case PyUnicode_4BYTE_KIND:
9859 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9860 break;
9861 default:
9862 assert(0);
9863 break;
9864 }
9865 leave:
9866 PyMem_FREE(tmp);
9867 return res;
9868}
9869
Tim Peters8ce9f162004-08-27 01:49:32 +00009870PyObject *
9871PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009873 PyObject *res;
9874 PyObject *fseq;
9875 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009876 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009878 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009879 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009880 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009881 }
9882
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009883 /* NOTE: the following code can't call back into Python code,
9884 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009885 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009886
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009887 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009888 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009889 res = _PyUnicode_JoinArray(separator, items, seqlen);
9890 Py_DECREF(fseq);
9891 return res;
9892}
9893
9894PyObject *
9895_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9896{
9897 PyObject *res = NULL; /* the result */
9898 PyObject *sep = NULL;
9899 Py_ssize_t seplen;
9900 PyObject *item;
9901 Py_ssize_t sz, i, res_offset;
9902 Py_UCS4 maxchar;
9903 Py_UCS4 item_maxchar;
9904 int use_memcpy;
9905 unsigned char *res_data = NULL, *sep_data = NULL;
9906 PyObject *last_obj;
9907 unsigned int kind = 0;
9908
Tim Peters05eba1f2004-08-27 21:32:02 +00009909 /* If empty sequence, return u"". */
9910 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009911 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009912 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009913
Tim Peters05eba1f2004-08-27 21:32:02 +00009914 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009915 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009916 if (seqlen == 1) {
9917 if (PyUnicode_CheckExact(items[0])) {
9918 res = items[0];
9919 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009920 return res;
9921 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009922 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009923 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009924 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009925 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009926 /* Set up sep and seplen */
9927 if (separator == NULL) {
9928 /* fall back to a blank space separator */
9929 sep = PyUnicode_FromOrdinal(' ');
9930 if (!sep)
9931 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009932 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009933 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009934 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009935 else {
9936 if (!PyUnicode_Check(separator)) {
9937 PyErr_Format(PyExc_TypeError,
9938 "separator: expected str instance,"
9939 " %.80s found",
9940 Py_TYPE(separator)->tp_name);
9941 goto onError;
9942 }
9943 if (PyUnicode_READY(separator))
9944 goto onError;
9945 sep = separator;
9946 seplen = PyUnicode_GET_LENGTH(separator);
9947 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9948 /* inc refcount to keep this code path symmetric with the
9949 above case of a blank separator */
9950 Py_INCREF(sep);
9951 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009952 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009953 }
9954
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009955 /* There are at least two things to join, or else we have a subclass
9956 * of str in the sequence.
9957 * Do a pre-pass to figure out the total amount of space we'll
9958 * need (sz), and see whether all argument are strings.
9959 */
9960 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009961#ifdef Py_DEBUG
9962 use_memcpy = 0;
9963#else
9964 use_memcpy = 1;
9965#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009967 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009969 if (!PyUnicode_Check(item)) {
9970 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009971 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 " %.80s found",
9973 i, Py_TYPE(item)->tp_name);
9974 goto onError;
9975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 if (PyUnicode_READY(item) == -1)
9977 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009978 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009980 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009981 if (i != 0) {
9982 add_sz += seplen;
9983 }
9984 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009985 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009987 goto onError;
9988 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009989 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009990 if (use_memcpy && last_obj != NULL) {
9991 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9992 use_memcpy = 0;
9993 }
9994 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009995 }
Tim Petersced69f82003-09-16 20:30:58 +00009996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009998 if (res == NULL)
9999 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010000
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010001 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010002#ifdef Py_DEBUG
10003 use_memcpy = 0;
10004#else
10005 if (use_memcpy) {
10006 res_data = PyUnicode_1BYTE_DATA(res);
10007 kind = PyUnicode_KIND(res);
10008 if (seplen != 0)
10009 sep_data = PyUnicode_1BYTE_DATA(sep);
10010 }
10011#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010012 if (use_memcpy) {
10013 for (i = 0; i < seqlen; ++i) {
10014 Py_ssize_t itemlen;
10015 item = items[i];
10016
10017 /* Copy item, and maybe the separator. */
10018 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010019 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010020 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010021 kind * seplen);
10022 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010023 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010024
10025 itemlen = PyUnicode_GET_LENGTH(item);
10026 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010027 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010028 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010029 kind * itemlen);
10030 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010032 }
10033 assert(res_data == PyUnicode_1BYTE_DATA(res)
10034 + kind * PyUnicode_GET_LENGTH(res));
10035 }
10036 else {
10037 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10038 Py_ssize_t itemlen;
10039 item = items[i];
10040
10041 /* Copy item, and maybe the separator. */
10042 if (i && seplen != 0) {
10043 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10044 res_offset += seplen;
10045 }
10046
10047 itemlen = PyUnicode_GET_LENGTH(item);
10048 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010049 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010050 res_offset += itemlen;
10051 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010052 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010053 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010054 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010057 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059
Benjamin Peterson29060642009-01-31 22:14:21 +000010060 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010062 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063 return NULL;
10064}
10065
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code-unit index `start`.  `kind` selects the code-unit type:
   unsigned char, Py_UCS2 or Py_UCS4.  `value` is converted to that type on
   store; the 1-byte case uses memset().

   Every macro argument is parenthesized at each use so that compound
   expressions can be passed safely.  Note that `kind`, `value` and `length`
   may still be evaluated more than once, so arguments must be free of side
   effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10089
Victor Stinnerd3f08822012-05-29 12:57:52 +020010090void
10091_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10092 Py_UCS4 fill_char)
10093{
10094 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10095 const void *data = PyUnicode_DATA(unicode);
10096 assert(PyUnicode_IS_READY(unicode));
10097 assert(unicode_modifiable(unicode));
10098 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10099 assert(start >= 0);
10100 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10101 FILL(kind, data, fill_char, start, length);
10102}
10103
Victor Stinner3fe55312012-01-04 00:33:50 +010010104Py_ssize_t
10105PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10106 Py_UCS4 fill_char)
10107{
10108 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010109
10110 if (!PyUnicode_Check(unicode)) {
10111 PyErr_BadInternalCall();
10112 return -1;
10113 }
10114 if (PyUnicode_READY(unicode) == -1)
10115 return -1;
10116 if (unicode_check_modifiable(unicode))
10117 return -1;
10118
Victor Stinnerd3f08822012-05-29 12:57:52 +020010119 if (start < 0) {
10120 PyErr_SetString(PyExc_IndexError, "string index out of range");
10121 return -1;
10122 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010123 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10124 PyErr_SetString(PyExc_ValueError,
10125 "fill character is bigger than "
10126 "the string maximum character");
10127 return -1;
10128 }
10129
10130 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10131 length = Py_MIN(maxlen, length);
10132 if (length <= 0)
10133 return 0;
10134
Victor Stinnerd3f08822012-05-29 12:57:52 +020010135 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010136 return length;
10137}
10138
Victor Stinner9310abb2011-10-05 00:59:23 +020010139static PyObject *
10140pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010141 Py_ssize_t left,
10142 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 PyObject *u;
10146 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010147 int kind;
10148 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149
10150 if (left < 0)
10151 left = 0;
10152 if (right < 0)
10153 right = 0;
10154
Victor Stinnerc4b49542011-12-11 22:44:26 +010010155 if (left == 0 && right == 0)
10156 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10159 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010160 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10161 return NULL;
10162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010164 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010166 if (!u)
10167 return NULL;
10168
10169 kind = PyUnicode_KIND(u);
10170 data = PyUnicode_DATA(u);
10171 if (left)
10172 FILL(kind, data, fill, 0, left);
10173 if (right)
10174 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010175 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010176 assert(_PyUnicode_CheckConsistency(u, 1));
10177 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178}
10179
Alexander Belopolsky40018472011-02-26 01:02:56 +000010180PyObject *
10181PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010185 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187
Benjamin Petersonead6b532011-12-20 17:23:42 -060010188 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 if (PyUnicode_IS_ASCII(string))
10191 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193 PyUnicode_GET_LENGTH(string), keepends);
10194 else
10195 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010197 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 break;
10199 case PyUnicode_2BYTE_KIND:
10200 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010201 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 PyUnicode_GET_LENGTH(string), keepends);
10203 break;
10204 case PyUnicode_4BYTE_KIND:
10205 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010206 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 PyUnicode_GET_LENGTH(string), keepends);
10208 break;
10209 default:
10210 assert(0);
10211 list = 0;
10212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214}
10215
Alexander Belopolsky40018472011-02-26 01:02:56 +000010216static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010217split(PyObject *self,
10218 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010219 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010221 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 void *buf1, *buf2;
10223 Py_ssize_t len1, len2;
10224 PyObject* out;
10225
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010227 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 if (PyUnicode_READY(self) == -1)
10230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010233 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010235 if (PyUnicode_IS_ASCII(self))
10236 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010237 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010238 PyUnicode_GET_LENGTH(self), maxcount
10239 );
10240 else
10241 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010242 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010243 PyUnicode_GET_LENGTH(self), maxcount
10244 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 case PyUnicode_2BYTE_KIND:
10246 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010247 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 PyUnicode_GET_LENGTH(self), maxcount
10249 );
10250 case PyUnicode_4BYTE_KIND:
10251 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010252 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 PyUnicode_GET_LENGTH(self), maxcount
10254 );
10255 default:
10256 assert(0);
10257 return NULL;
10258 }
10259
10260 if (PyUnicode_READY(substring) == -1)
10261 return NULL;
10262
10263 kind1 = PyUnicode_KIND(self);
10264 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 len1 = PyUnicode_GET_LENGTH(self);
10266 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010267 if (kind1 < kind2 || len1 < len2) {
10268 out = PyList_New(1);
10269 if (out == NULL)
10270 return NULL;
10271 Py_INCREF(self);
10272 PyList_SET_ITEM(out, 0, self);
10273 return out;
10274 }
10275 buf1 = PyUnicode_DATA(self);
10276 buf2 = PyUnicode_DATA(substring);
10277 if (kind2 != kind1) {
10278 buf2 = _PyUnicode_AsKind(substring, kind1);
10279 if (!buf2)
10280 return NULL;
10281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010283 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010285 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10286 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010287 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010288 else
10289 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 break;
10292 case PyUnicode_2BYTE_KIND:
10293 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010294 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 break;
10296 case PyUnicode_4BYTE_KIND:
10297 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010298 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 break;
10300 default:
10301 out = NULL;
10302 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010303 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 PyMem_Free(buf2);
10305 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306}
10307
Alexander Belopolsky40018472011-02-26 01:02:56 +000010308static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010309rsplit(PyObject *self,
10310 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010311 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010312{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010313 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 void *buf1, *buf2;
10315 Py_ssize_t len1, len2;
10316 PyObject* out;
10317
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010318 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010319 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (PyUnicode_READY(self) == -1)
10322 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010325 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010327 if (PyUnicode_IS_ASCII(self))
10328 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010330 PyUnicode_GET_LENGTH(self), maxcount
10331 );
10332 else
10333 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010335 PyUnicode_GET_LENGTH(self), maxcount
10336 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 case PyUnicode_2BYTE_KIND:
10338 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 PyUnicode_GET_LENGTH(self), maxcount
10341 );
10342 case PyUnicode_4BYTE_KIND:
10343 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010344 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 PyUnicode_GET_LENGTH(self), maxcount
10346 );
10347 default:
10348 assert(0);
10349 return NULL;
10350 }
10351
10352 if (PyUnicode_READY(substring) == -1)
10353 return NULL;
10354
10355 kind1 = PyUnicode_KIND(self);
10356 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 len1 = PyUnicode_GET_LENGTH(self);
10358 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010359 if (kind1 < kind2 || len1 < len2) {
10360 out = PyList_New(1);
10361 if (out == NULL)
10362 return NULL;
10363 Py_INCREF(self);
10364 PyList_SET_ITEM(out, 0, self);
10365 return out;
10366 }
10367 buf1 = PyUnicode_DATA(self);
10368 buf2 = PyUnicode_DATA(substring);
10369 if (kind2 != kind1) {
10370 buf2 = _PyUnicode_AsKind(substring, kind1);
10371 if (!buf2)
10372 return NULL;
10373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010375 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010377 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10378 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010380 else
10381 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010382 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 break;
10384 case PyUnicode_2BYTE_KIND:
10385 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 break;
10388 case PyUnicode_4BYTE_KIND:
10389 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 break;
10392 default:
10393 out = NULL;
10394 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010395 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 PyMem_Free(buf2);
10397 return out;
10398}
10399
10400static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010401anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10402 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010404 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010406 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10407 return asciilib_find(buf1, len1, buf2, len2, offset);
10408 else
10409 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 case PyUnicode_2BYTE_KIND:
10411 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10412 case PyUnicode_4BYTE_KIND:
10413 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10414 }
10415 assert(0);
10416 return -1;
10417}
10418
10419static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010420anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10421 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010423 switch (kind) {
10424 case PyUnicode_1BYTE_KIND:
10425 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10426 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10427 else
10428 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10429 case PyUnicode_2BYTE_KIND:
10430 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10431 case PyUnicode_4BYTE_KIND:
10432 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10433 }
10434 assert(0);
10435 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010436}
10437
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010438static void
10439replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10440 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10441{
10442 int kind = PyUnicode_KIND(u);
10443 void *data = PyUnicode_DATA(u);
10444 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10445 if (kind == PyUnicode_1BYTE_KIND) {
10446 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10447 (Py_UCS1 *)data + len,
10448 u1, u2, maxcount);
10449 }
10450 else if (kind == PyUnicode_2BYTE_KIND) {
10451 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10452 (Py_UCS2 *)data + len,
10453 u1, u2, maxcount);
10454 }
10455 else {
10456 assert(kind == PyUnicode_4BYTE_KIND);
10457 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10458 (Py_UCS4 *)data + len,
10459 u1, u2, maxcount);
10460 }
10461}
10462
Alexander Belopolsky40018472011-02-26 01:02:56 +000010463static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464replace(PyObject *self, PyObject *str1,
10465 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 PyObject *u;
10468 char *sbuf = PyUnicode_DATA(self);
10469 char *buf1 = PyUnicode_DATA(str1);
10470 char *buf2 = PyUnicode_DATA(str2);
10471 int srelease = 0, release1 = 0, release2 = 0;
10472 int skind = PyUnicode_KIND(self);
10473 int kind1 = PyUnicode_KIND(str1);
10474 int kind2 = PyUnicode_KIND(str2);
10475 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10476 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10477 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010478 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010479 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480
10481 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010482 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010484 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485
Victor Stinner59de0ee2011-10-07 10:01:28 +020010486 if (str1 == str2)
10487 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488
Victor Stinner49a0a212011-10-12 23:46:10 +020010489 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010490 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10491 if (maxchar < maxchar_str1)
10492 /* substring too wide to be present */
10493 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010494 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10495 /* Replacing str1 with str2 may cause a maxchar reduction in the
10496 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010497 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010498 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010501 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010503 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010505 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010506 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010507 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010508
Victor Stinner69ed0f42013-04-09 21:48:24 +020010509 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010510 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010511 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010512 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010513 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010515 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010517
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010518 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10519 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010520 }
10521 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 int rkind = skind;
10523 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010524 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 if (kind1 < rkind) {
10527 /* widen substring */
10528 buf1 = _PyUnicode_AsKind(str1, rkind);
10529 if (!buf1) goto error;
10530 release1 = 1;
10531 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010532 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010533 if (i < 0)
10534 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 if (rkind > kind2) {
10536 /* widen replacement */
10537 buf2 = _PyUnicode_AsKind(str2, rkind);
10538 if (!buf2) goto error;
10539 release2 = 1;
10540 }
10541 else if (rkind < kind2) {
10542 /* widen self and buf1 */
10543 rkind = kind2;
10544 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010545 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 sbuf = _PyUnicode_AsKind(self, rkind);
10547 if (!sbuf) goto error;
10548 srelease = 1;
10549 buf1 = _PyUnicode_AsKind(str1, rkind);
10550 if (!buf1) goto error;
10551 release1 = 1;
10552 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010553 u = PyUnicode_New(slen, maxchar);
10554 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010556 assert(PyUnicode_KIND(u) == rkind);
10557 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010558
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010559 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010560 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010561 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010563 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010565
10566 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010567 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010568 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010569 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010570 if (i == -1)
10571 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010572 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010574 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010578 }
10579 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010581 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 int rkind = skind;
10583 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 buf1 = _PyUnicode_AsKind(str1, rkind);
10588 if (!buf1) goto error;
10589 release1 = 1;
10590 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010591 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010592 if (n == 0)
10593 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010595 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 buf2 = _PyUnicode_AsKind(str2, rkind);
10597 if (!buf2) goto error;
10598 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010601 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 rkind = kind2;
10603 sbuf = _PyUnicode_AsKind(self, rkind);
10604 if (!sbuf) goto error;
10605 srelease = 1;
10606 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010607 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 buf1 = _PyUnicode_AsKind(str1, rkind);
10609 if (!buf1) goto error;
10610 release1 = 1;
10611 }
10612 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10613 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010614 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 PyErr_SetString(PyExc_OverflowError,
10616 "replace string is too long");
10617 goto error;
10618 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010619 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010620 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010621 _Py_INCREF_UNICODE_EMPTY();
10622 if (!unicode_empty)
10623 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010624 u = unicode_empty;
10625 goto done;
10626 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010627 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 PyErr_SetString(PyExc_OverflowError,
10629 "replace string is too long");
10630 goto error;
10631 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010632 u = PyUnicode_New(new_size, maxchar);
10633 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010635 assert(PyUnicode_KIND(u) == rkind);
10636 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 ires = i = 0;
10638 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010639 while (n-- > 0) {
10640 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010641 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010642 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010643 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010644 if (j == -1)
10645 break;
10646 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010647 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010648 memcpy(res + rkind * ires,
10649 sbuf + rkind * i,
10650 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 }
10653 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res + rkind * ires,
10665 sbuf + rkind * i,
10666 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010667 }
10668 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669 /* interleave */
10670 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010671 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 if (--n <= 0)
10676 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010677 memcpy(res + rkind * ires,
10678 sbuf + rkind * i,
10679 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 ires++;
10681 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010682 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010683 memcpy(res + rkind * ires,
10684 sbuf + rkind * i,
10685 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 }
10688
10689 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010690 unicode_adjust_maxchar(&u);
10691 if (u == NULL)
10692 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010694
10695 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 if (srelease)
10697 PyMem_FREE(sbuf);
10698 if (release1)
10699 PyMem_FREE(buf1);
10700 if (release2)
10701 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010702 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010704
Benjamin Peterson29060642009-01-31 22:14:21 +000010705 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010706 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 if (srelease)
10708 PyMem_FREE(sbuf);
10709 if (release1)
10710 PyMem_FREE(buf1);
10711 if (release2)
10712 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010713 return unicode_result_unchanged(self);
10714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 error:
10716 if (srelease && sbuf)
10717 PyMem_FREE(sbuf);
10718 if (release1 && buf1)
10719 PyMem_FREE(buf1);
10720 if (release2 && buf2)
10721 PyMem_FREE(buf2);
10722 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723}
10724
10725/* --- Unicode Object Methods --------------------------------------------- */
10726
INADA Naoki3ae20562017-01-16 20:41:20 +090010727/*[clinic input]
10728str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
INADA Naoki3ae20562017-01-16 20:41:20 +090010730Return a version of the string where each word is titlecased.
10731
10732More specifically, words start with uppercased characters and all remaining
10733cased characters have lower case.
10734[clinic start generated code]*/
10735
10736static PyObject *
10737unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010738/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010740 if (PyUnicode_READY(self) == -1)
10741 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010742 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743}
10744
INADA Naoki3ae20562017-01-16 20:41:20 +090010745/*[clinic input]
10746str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
INADA Naoki3ae20562017-01-16 20:41:20 +090010748Return a capitalized version of the string.
10749
10750More specifically, make the first character have upper case and the rest lower
10751case.
10752[clinic start generated code]*/
10753
10754static PyObject *
10755unicode_capitalize_impl(PyObject *self)
10756/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010758 if (PyUnicode_READY(self) == -1)
10759 return NULL;
10760 if (PyUnicode_GET_LENGTH(self) == 0)
10761 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010762 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763}
10764
INADA Naoki3ae20562017-01-16 20:41:20 +090010765/*[clinic input]
10766str.casefold as unicode_casefold
10767
10768Return a version of the string suitable for caseless comparisons.
10769[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010770
10771static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010772unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010773/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010774{
10775 if (PyUnicode_READY(self) == -1)
10776 return NULL;
10777 if (PyUnicode_IS_ASCII(self))
10778 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010779 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010780}
10781
10782
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010783/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010784
10785static int
10786convert_uc(PyObject *obj, void *addr)
10787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010789
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010790 if (!PyUnicode_Check(obj)) {
10791 PyErr_Format(PyExc_TypeError,
10792 "The fill character must be a unicode character, "
10793 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010794 return 0;
10795 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010796 if (PyUnicode_READY(obj) < 0)
10797 return 0;
10798 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010799 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010800 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010801 return 0;
10802 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010803 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010804 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010805}
10806
INADA Naoki3ae20562017-01-16 20:41:20 +090010807/*[clinic input]
10808str.center as unicode_center
10809
10810 width: Py_ssize_t
10811 fillchar: Py_UCS4 = ' '
10812 /
10813
10814Return a centered string of length width.
10815
10816Padding is done using the specified fill character (default is a space).
10817[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818
10819static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010820unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10821/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010823 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
Benjamin Petersonbac79492012-01-14 13:34:47 -050010825 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826 return NULL;
10827
Victor Stinnerc4b49542011-12-11 22:44:26 +010010828 if (PyUnicode_GET_LENGTH(self) >= width)
10829 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
Victor Stinnerc4b49542011-12-11 22:44:26 +010010831 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832 left = marg / 2 + (marg & width & 1);
10833
Victor Stinner9310abb2011-10-05 00:59:23 +020010834 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835}
10836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837/* This function assumes that str1 and str2 are readied by the caller. */
10838
Marc-André Lemburge5034372000-08-08 08:04:29 +000010839static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010840unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010841{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010842#define COMPARE(TYPE1, TYPE2) \
10843 do { \
10844 TYPE1* p1 = (TYPE1 *)data1; \
10845 TYPE2* p2 = (TYPE2 *)data2; \
10846 TYPE1* end = p1 + len; \
10847 Py_UCS4 c1, c2; \
10848 for (; p1 != end; p1++, p2++) { \
10849 c1 = *p1; \
10850 c2 = *p2; \
10851 if (c1 != c2) \
10852 return (c1 < c2) ? -1 : 1; \
10853 } \
10854 } \
10855 while (0)
10856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 int kind1, kind2;
10858 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010859 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 kind1 = PyUnicode_KIND(str1);
10862 kind2 = PyUnicode_KIND(str2);
10863 data1 = PyUnicode_DATA(str1);
10864 data2 = PyUnicode_DATA(str2);
10865 len1 = PyUnicode_GET_LENGTH(str1);
10866 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010867 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010868
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010869 switch(kind1) {
10870 case PyUnicode_1BYTE_KIND:
10871 {
10872 switch(kind2) {
10873 case PyUnicode_1BYTE_KIND:
10874 {
10875 int cmp = memcmp(data1, data2, len);
10876 /* normalize result of memcmp() into the range [-1; 1] */
10877 if (cmp < 0)
10878 return -1;
10879 if (cmp > 0)
10880 return 1;
10881 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010882 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010883 case PyUnicode_2BYTE_KIND:
10884 COMPARE(Py_UCS1, Py_UCS2);
10885 break;
10886 case PyUnicode_4BYTE_KIND:
10887 COMPARE(Py_UCS1, Py_UCS4);
10888 break;
10889 default:
10890 assert(0);
10891 }
10892 break;
10893 }
10894 case PyUnicode_2BYTE_KIND:
10895 {
10896 switch(kind2) {
10897 case PyUnicode_1BYTE_KIND:
10898 COMPARE(Py_UCS2, Py_UCS1);
10899 break;
10900 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010901 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010902 COMPARE(Py_UCS2, Py_UCS2);
10903 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010904 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010905 case PyUnicode_4BYTE_KIND:
10906 COMPARE(Py_UCS2, Py_UCS4);
10907 break;
10908 default:
10909 assert(0);
10910 }
10911 break;
10912 }
10913 case PyUnicode_4BYTE_KIND:
10914 {
10915 switch(kind2) {
10916 case PyUnicode_1BYTE_KIND:
10917 COMPARE(Py_UCS4, Py_UCS1);
10918 break;
10919 case PyUnicode_2BYTE_KIND:
10920 COMPARE(Py_UCS4, Py_UCS2);
10921 break;
10922 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010923 {
10924#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10925 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10926 /* normalize result of wmemcmp() into the range [-1; 1] */
10927 if (cmp < 0)
10928 return -1;
10929 if (cmp > 0)
10930 return 1;
10931#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010932 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010933#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010934 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010935 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010936 default:
10937 assert(0);
10938 }
10939 break;
10940 }
10941 default:
10942 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010943 }
10944
Victor Stinner770e19e2012-10-04 22:59:45 +020010945 if (len1 == len2)
10946 return 0;
10947 if (len1 < len2)
10948 return -1;
10949 else
10950 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010951
10952#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010953}
10954
Benjamin Peterson621b4302016-09-09 13:54:34 -070010955static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010956unicode_compare_eq(PyObject *str1, PyObject *str2)
10957{
10958 int kind;
10959 void *data1, *data2;
10960 Py_ssize_t len;
10961 int cmp;
10962
Victor Stinnere5567ad2012-10-23 02:48:49 +020010963 len = PyUnicode_GET_LENGTH(str1);
10964 if (PyUnicode_GET_LENGTH(str2) != len)
10965 return 0;
10966 kind = PyUnicode_KIND(str1);
10967 if (PyUnicode_KIND(str2) != kind)
10968 return 0;
10969 data1 = PyUnicode_DATA(str1);
10970 data2 = PyUnicode_DATA(str2);
10971
10972 cmp = memcmp(data1, data2, len * kind);
10973 return (cmp == 0);
10974}
10975
10976
Alexander Belopolsky40018472011-02-26 01:02:56 +000010977int
10978PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10981 if (PyUnicode_READY(left) == -1 ||
10982 PyUnicode_READY(right) == -1)
10983 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010984
10985 /* a string is equal to itself */
10986 if (left == right)
10987 return 0;
10988
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010989 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010991 PyErr_Format(PyExc_TypeError,
10992 "Can't compare %.100s and %.100s",
10993 left->ob_type->tp_name,
10994 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 return -1;
10996}
10997
Martin v. Löwis5b222132007-06-10 09:51:05 +000010998int
10999PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11000{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 Py_ssize_t i;
11002 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011004 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005
Victor Stinner910337b2011-10-03 03:20:16 +020011006 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011007 if (!PyUnicode_IS_READY(uni)) {
11008 const wchar_t *ws = _PyUnicode_WSTR(uni);
11009 /* Compare Unicode string and source character set string */
11010 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11011 if (chr != ustr[i])
11012 return (chr < ustr[i]) ? -1 : 1;
11013 }
11014 /* This check keeps Python strings that end in '\0' from comparing equal
11015 to C strings identical up to that point. */
11016 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11017 return 1; /* uni is longer */
11018 if (ustr[i])
11019 return -1; /* str is longer */
11020 return 0;
11021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011023 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011024 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011025 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011026 size_t len, len2 = strlen(str);
11027 int cmp;
11028
11029 len = Py_MIN(len1, len2);
11030 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011031 if (cmp != 0) {
11032 if (cmp < 0)
11033 return -1;
11034 else
11035 return 1;
11036 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011037 if (len1 > len2)
11038 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011039 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011040 return -1; /* str is longer */
11041 return 0;
11042 }
11043 else {
11044 void *data = PyUnicode_DATA(uni);
11045 /* Compare Unicode string and source character set string */
11046 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011047 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011048 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11049 /* This check keeps Python strings that end in '\0' from comparing equal
11050 to C strings identical up to that point. */
11051 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11052 return 1; /* uni is longer */
11053 if (str[i])
11054 return -1; /* str is longer */
11055 return 0;
11056 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011057}
11058
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011059static int
11060non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11061{
11062 size_t i, len;
11063 const wchar_t *p;
11064 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11065 if (strlen(str) != len)
11066 return 0;
11067 p = _PyUnicode_WSTR(unicode);
11068 assert(p);
11069 for (i = 0; i < len; i++) {
11070 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011071 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011072 return 0;
11073 }
11074 return 1;
11075}
11076
11077int
11078_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11079{
11080 size_t len;
11081 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011082 assert(str);
11083#ifndef NDEBUG
11084 for (const char *p = str; *p; p++) {
11085 assert((unsigned char)*p < 128);
11086 }
11087#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011088 if (PyUnicode_READY(unicode) == -1) {
11089 /* Memory error or bad data */
11090 PyErr_Clear();
11091 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11092 }
11093 if (!PyUnicode_IS_ASCII(unicode))
11094 return 0;
11095 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11096 return strlen(str) == len &&
11097 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11098}
11099
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011100int
11101_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11102{
11103 PyObject *right_uni;
11104 Py_hash_t hash;
11105
11106 assert(_PyUnicode_CHECK(left));
11107 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011108#ifndef NDEBUG
11109 for (const char *p = right->string; *p; p++) {
11110 assert((unsigned char)*p < 128);
11111 }
11112#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011113
11114 if (PyUnicode_READY(left) == -1) {
11115 /* memory error or bad data */
11116 PyErr_Clear();
11117 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11118 }
11119
11120 if (!PyUnicode_IS_ASCII(left))
11121 return 0;
11122
11123 right_uni = _PyUnicode_FromId(right); /* borrowed */
11124 if (right_uni == NULL) {
11125 /* memory error or bad data */
11126 PyErr_Clear();
11127 return _PyUnicode_EqualToASCIIString(left, right->string);
11128 }
11129
11130 if (left == right_uni)
11131 return 1;
11132
11133 if (PyUnicode_CHECK_INTERNED(left))
11134 return 0;
11135
11136 assert(_PyUnicode_HASH(right_uni) != 1);
11137 hash = _PyUnicode_HASH(left);
11138 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11139 return 0;
11140
11141 return unicode_compare_eq(left, right_uni);
11142}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011143
/* Map a C truth value onto the Py_True / Py_False singletons (no reference
   is taken; the caller is responsible for Py_INCREF). */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011146
Alexander Belopolsky40018472011-02-26 01:02:56 +000011147PyObject *
11148PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011149{
11150 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011151 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011152
Victor Stinnere5567ad2012-10-23 02:48:49 +020011153 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11154 Py_RETURN_NOTIMPLEMENTED;
11155
11156 if (PyUnicode_READY(left) == -1 ||
11157 PyUnicode_READY(right) == -1)
11158 return NULL;
11159
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011160 if (left == right) {
11161 switch (op) {
11162 case Py_EQ:
11163 case Py_LE:
11164 case Py_GE:
11165 /* a string is equal to itself */
11166 v = Py_True;
11167 break;
11168 case Py_NE:
11169 case Py_LT:
11170 case Py_GT:
11171 v = Py_False;
11172 break;
11173 default:
11174 PyErr_BadArgument();
11175 return NULL;
11176 }
11177 }
11178 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011179 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011180 result ^= (op == Py_NE);
11181 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011182 }
11183 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011184 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011185
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011186 /* Convert the return value to a Boolean */
11187 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011188 case Py_LE:
11189 v = TEST_COND(result <= 0);
11190 break;
11191 case Py_GE:
11192 v = TEST_COND(result >= 0);
11193 break;
11194 case Py_LT:
11195 v = TEST_COND(result == -1);
11196 break;
11197 case Py_GT:
11198 v = TEST_COND(result == 1);
11199 break;
11200 default:
11201 PyErr_BadArgument();
11202 return NULL;
11203 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011204 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011205 Py_INCREF(v);
11206 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011207}
11208
Alexander Belopolsky40018472011-02-26 01:02:56 +000011209int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011210_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11211{
11212 return unicode_eq(aa, bb);
11213}
11214
11215int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011216PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011217{
Victor Stinner77282cb2013-04-14 19:22:47 +020011218 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 void *buf1, *buf2;
11220 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011221 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011222
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011223 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 "'in <string>' requires string as left operand, not %.100s",
11226 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011227 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011228 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011230 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011231 if (ensure_unicode(str) < 0)
11232 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011235 kind2 = PyUnicode_KIND(substr);
11236 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011237 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011239 len2 = PyUnicode_GET_LENGTH(substr);
11240 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011241 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011242 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011243 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011244 if (len2 == 1) {
11245 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11246 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011247 return result;
11248 }
11249 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011250 buf2 = _PyUnicode_AsKind(substr, kind1);
11251 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011252 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254
Victor Stinner77282cb2013-04-14 19:22:47 +020011255 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 case PyUnicode_1BYTE_KIND:
11257 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11258 break;
11259 case PyUnicode_2BYTE_KIND:
11260 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11261 break;
11262 case PyUnicode_4BYTE_KIND:
11263 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11264 break;
11265 default:
11266 result = -1;
11267 assert(0);
11268 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011269
Victor Stinner77282cb2013-04-14 19:22:47 +020011270 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 PyMem_Free(buf2);
11272
Guido van Rossum403d68b2000-03-13 15:55:09 +000011273 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011274}
11275
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276/* Concat to string or Unicode object giving a new Unicode object. */
11277
Alexander Belopolsky40018472011-02-26 01:02:56 +000011278PyObject *
11279PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011281 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011282 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011283 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011285 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
11288 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011289 if (left == unicode_empty)
11290 return PyUnicode_FromObject(right);
11291 if (right == unicode_empty)
11292 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 left_len = PyUnicode_GET_LENGTH(left);
11295 right_len = PyUnicode_GET_LENGTH(right);
11296 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011297 PyErr_SetString(PyExc_OverflowError,
11298 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011300 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011302
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11304 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011305 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011308 result = PyUnicode_New(new_len, maxchar);
11309 if (result == NULL)
11310 return NULL;
11311 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11312 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11313 assert(_PyUnicode_CheckConsistency(result, 1));
11314 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315}
11316
Walter Dörwald1ab83302007-05-18 17:15:44 +000011317void
Victor Stinner23e56682011-10-03 03:54:37 +020011318PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011319{
Victor Stinner23e56682011-10-03 03:54:37 +020011320 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011321 Py_UCS4 maxchar, maxchar2;
11322 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011323
11324 if (p_left == NULL) {
11325 if (!PyErr_Occurred())
11326 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011327 return;
11328 }
Victor Stinner23e56682011-10-03 03:54:37 +020011329 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011330 if (right == NULL || left == NULL
11331 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011332 if (!PyErr_Occurred())
11333 PyErr_BadInternalCall();
11334 goto error;
11335 }
11336
Benjamin Petersonbac79492012-01-14 13:34:47 -050011337 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011338 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011339 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011340 goto error;
11341
Victor Stinner488fa492011-12-12 00:01:39 +010011342 /* Shortcuts */
11343 if (left == unicode_empty) {
11344 Py_DECREF(left);
11345 Py_INCREF(right);
11346 *p_left = right;
11347 return;
11348 }
11349 if (right == unicode_empty)
11350 return;
11351
11352 left_len = PyUnicode_GET_LENGTH(left);
11353 right_len = PyUnicode_GET_LENGTH(right);
11354 if (left_len > PY_SSIZE_T_MAX - right_len) {
11355 PyErr_SetString(PyExc_OverflowError,
11356 "strings are too large to concat");
11357 goto error;
11358 }
11359 new_len = left_len + right_len;
11360
11361 if (unicode_modifiable(left)
11362 && PyUnicode_CheckExact(right)
11363 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011364 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11365 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011366 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011367 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011368 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11369 {
11370 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011371 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011372 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011373
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011374 /* copy 'right' into the newly allocated area of 'left' */
11375 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011376 }
Victor Stinner488fa492011-12-12 00:01:39 +010011377 else {
11378 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11379 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011380 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011381
Victor Stinner488fa492011-12-12 00:01:39 +010011382 /* Concat the two Unicode strings */
11383 res = PyUnicode_New(new_len, maxchar);
11384 if (res == NULL)
11385 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011386 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11387 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011388 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011389 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011390 }
11391 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011392 return;
11393
11394error:
Victor Stinner488fa492011-12-12 00:01:39 +010011395 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011396}
11397
11398void
11399PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11400{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011401 PyUnicode_Append(pleft, right);
11402 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011403}
11404
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011405/*
11406Wraps stringlib_parse_args_finds() and additionally ensures that the
11407first argument is a unicode object.
11408*/
11409
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011410static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011411parse_args_finds_unicode(const char * function_name, PyObject *args,
11412 PyObject **substring,
11413 Py_ssize_t *start, Py_ssize_t *end)
11414{
11415 if(stringlib_parse_args_finds(function_name, args, substring,
11416 start, end)) {
11417 if (ensure_unicode(*substring) < 0)
11418 return 0;
11419 return 1;
11420 }
11421 return 0;
11422}
11423
/* Docstring text for the S.count() string method. */
PyDoc_STRVAR(count__doc__,
    "S.count(sub[, start[, end]]) -> int\n\
\n\
Return the number of non-overlapping occurrences of substring sub in\n\
string S[start:end]. Optional arguments start and end are\n\
interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011432unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011434 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011435 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011436 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011438 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 void *buf1, *buf2;
11440 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011442 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 kind1 = PyUnicode_KIND(self);
11446 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011447 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011448 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 len1 = PyUnicode_GET_LENGTH(self);
11451 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011453 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011454 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011455
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011456 buf1 = PyUnicode_DATA(self);
11457 buf2 = PyUnicode_DATA(substring);
11458 if (kind2 != kind1) {
11459 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011460 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011461 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011462 }
11463 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 case PyUnicode_1BYTE_KIND:
11465 iresult = ucs1lib_count(
11466 ((Py_UCS1*)buf1) + start, end - start,
11467 buf2, len2, PY_SSIZE_T_MAX
11468 );
11469 break;
11470 case PyUnicode_2BYTE_KIND:
11471 iresult = ucs2lib_count(
11472 ((Py_UCS2*)buf1) + start, end - start,
11473 buf2, len2, PY_SSIZE_T_MAX
11474 );
11475 break;
11476 case PyUnicode_4BYTE_KIND:
11477 iresult = ucs4lib_count(
11478 ((Py_UCS4*)buf1) + start, end - start,
11479 buf2, len2, PY_SSIZE_T_MAX
11480 );
11481 break;
11482 default:
11483 assert(0); iresult = 0;
11484 }
11485
11486 result = PyLong_FromSsize_t(iresult);
11487
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011488 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 return result;
11492}
11493
INADA Naoki3ae20562017-01-16 20:41:20 +090011494/*[clinic input]
11495str.encode as unicode_encode
11496
11497 encoding: str(c_default="NULL") = 'utf-8'
11498 The encoding in which to encode the string.
11499 errors: str(c_default="NULL") = 'strict'
11500 The error handling scheme to use for encoding errors.
11501 The default is 'strict' meaning that encoding errors raise a
11502 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11503 'xmlcharrefreplace' as well as any other name registered with
11504 codecs.register_error that can handle UnicodeEncodeErrors.
11505
11506Encode the string using the codec registered for encoding.
11507[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
11509static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011510unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011511/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011513 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011514}
11515
INADA Naoki3ae20562017-01-16 20:41:20 +090011516/*[clinic input]
11517str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
INADA Naoki3ae20562017-01-16 20:41:20 +090011519 tabsize: int = 8
11520
11521Return a copy where all tab characters are expanded using spaces.
11522
11523If tabsize is not given, a tab size of 8 characters is assumed.
11524[clinic start generated code]*/
11525
11526static PyObject *
11527unicode_expandtabs_impl(PyObject *self, int tabsize)
11528/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011530 Py_ssize_t i, j, line_pos, src_len, incr;
11531 Py_UCS4 ch;
11532 PyObject *u;
11533 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011534 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011535 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
Antoine Pitrou22425222011-10-04 19:10:51 +020011537 if (PyUnicode_READY(self) == -1)
11538 return NULL;
11539
Thomas Wouters7e474022000-07-16 12:04:32 +000011540 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011541 src_len = PyUnicode_GET_LENGTH(self);
11542 i = j = line_pos = 0;
11543 kind = PyUnicode_KIND(self);
11544 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011545 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011546 for (; i < src_len; i++) {
11547 ch = PyUnicode_READ(kind, src_data, i);
11548 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011549 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011551 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011553 goto overflow;
11554 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011556 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011560 goto overflow;
11561 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011563 if (ch == '\n' || ch == '\r')
11564 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011566 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011567 if (!found)
11568 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011569
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011571 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 if (!u)
11573 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011574 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Antoine Pitroue71d5742011-10-04 15:55:09 +020011578 for (; i < src_len; i++) {
11579 ch = PyUnicode_READ(kind, src_data, i);
11580 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011582 incr = tabsize - (line_pos % tabsize);
11583 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011584 FILL(kind, dest_data, ' ', j, incr);
11585 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011587 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011589 line_pos++;
11590 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011591 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011592 if (ch == '\n' || ch == '\r')
11593 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011595 }
11596 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011597 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011598
Antoine Pitroue71d5742011-10-04 15:55:09 +020011599 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011600 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602}
11603
/* Docstring text for the S.find() string method. */
PyDoc_STRVAR(find__doc__,
    "S.find(sub[, start[, end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
11613static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011616 /* initialize variables to prevent gcc warning */
11617 PyObject *substring = NULL;
11618 Py_ssize_t start = 0;
11619 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011622 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011625 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011628 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 if (result == -2)
11631 return NULL;
11632
Christian Heimes217cfd12007-12-02 14:31:20 +000011633 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634}
11635
11636static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011637unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011639 void *data;
11640 enum PyUnicode_Kind kind;
11641 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011642
11643 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11644 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011646 }
11647 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11648 PyErr_SetString(PyExc_IndexError, "string index out of range");
11649 return NULL;
11650 }
11651 kind = PyUnicode_KIND(self);
11652 data = PyUnicode_DATA(self);
11653 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011654 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655}
11656
Guido van Rossumc2504932007-09-18 19:42:40 +000011657/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011658 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011659static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011660unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661{
Guido van Rossumc2504932007-09-18 19:42:40 +000011662 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011663 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011664
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011665#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011666 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011667#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 if (_PyUnicode_HASH(self) != -1)
11669 return _PyUnicode_HASH(self);
11670 if (PyUnicode_READY(self) == -1)
11671 return -1;
11672 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011673 /*
11674 We make the hash of the empty string be 0, rather than using
11675 (prefix ^ suffix), since this slightly obfuscates the hash secret
11676 */
11677 if (len == 0) {
11678 _PyUnicode_HASH(self) = 0;
11679 return 0;
11680 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011681 x = _Py_HashBytes(PyUnicode_DATA(self),
11682 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011684 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685}
11686
/* Docstring text for the S.index() string method. */
PyDoc_STRVAR(index__doc__,
    "S.index(sub[, start[, end]]) -> int\n\
\n\
Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
11692static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011695 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011696 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011697 PyObject *substring = NULL;
11698 Py_ssize_t start = 0;
11699 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011701 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011704 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011707 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (result == -2)
11710 return NULL;
11711
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 if (result < 0) {
11713 PyErr_SetString(PyExc_ValueError, "substring not found");
11714 return NULL;
11715 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011716
Christian Heimes217cfd12007-12-02 14:31:20 +000011717 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718}
11719
INADA Naoki3ae20562017-01-16 20:41:20 +090011720/*[clinic input]
11721str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722
INADA Naoki3ae20562017-01-16 20:41:20 +090011723Return True if the string is a lowercase string, False otherwise.
11724
11725A string is lowercase if all cased characters in the string are lowercase and
11726there is at least one cased character in the string.
11727[clinic start generated code]*/
11728
11729static PyObject *
11730unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011731/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 Py_ssize_t i, length;
11734 int kind;
11735 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736 int cased;
11737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 if (PyUnicode_READY(self) == -1)
11739 return NULL;
11740 length = PyUnicode_GET_LENGTH(self);
11741 kind = PyUnicode_KIND(self);
11742 data = PyUnicode_DATA(self);
11743
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (length == 1)
11746 return PyBool_FromLong(
11747 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011749 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011752
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 for (i = 0; i < length; i++) {
11755 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011756
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11758 return PyBool_FromLong(0);
11759 else if (!cased && Py_UNICODE_ISLOWER(ch))
11760 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011762 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763}
11764
INADA Naoki3ae20562017-01-16 20:41:20 +090011765/*[clinic input]
11766str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
INADA Naoki3ae20562017-01-16 20:41:20 +090011768Return True if the string is an uppercase string, False otherwise.
11769
11770A string is uppercase if all cased characters in the string are uppercase and
11771there is at least one cased character in the string.
11772[clinic start generated code]*/
11773
11774static PyObject *
11775unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011776/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 Py_ssize_t i, length;
11779 int kind;
11780 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 int cased;
11782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 if (PyUnicode_READY(self) == -1)
11784 return NULL;
11785 length = PyUnicode_GET_LENGTH(self);
11786 kind = PyUnicode_KIND(self);
11787 data = PyUnicode_DATA(self);
11788
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 if (length == 1)
11791 return PyBool_FromLong(
11792 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011794 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011797
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 for (i = 0; i < length; i++) {
11800 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011801
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11803 return PyBool_FromLong(0);
11804 else if (!cased && Py_UNICODE_ISUPPER(ch))
11805 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011807 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808}
11809
INADA Naoki3ae20562017-01-16 20:41:20 +090011810/*[clinic input]
11811str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
INADA Naoki3ae20562017-01-16 20:41:20 +090011813Return True if the string is a title-cased string, False otherwise.
11814
11815In a title-cased string, upper- and title-case characters may only
11816follow uncased characters and lowercase characters only cased ones.
11817[clinic start generated code]*/
11818
11819static PyObject *
11820unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011821/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 Py_ssize_t i, length;
11824 int kind;
11825 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 int cased, previous_is_cased;
11827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 if (PyUnicode_READY(self) == -1)
11829 return NULL;
11830 length = PyUnicode_GET_LENGTH(self);
11831 kind = PyUnicode_KIND(self);
11832 data = PyUnicode_DATA(self);
11833
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 if (length == 1) {
11836 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11837 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11838 (Py_UNICODE_ISUPPER(ch) != 0));
11839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011841 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011844
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 cased = 0;
11846 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 for (i = 0; i < length; i++) {
11848 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011849
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11851 if (previous_is_cased)
11852 return PyBool_FromLong(0);
11853 previous_is_cased = 1;
11854 cased = 1;
11855 }
11856 else if (Py_UNICODE_ISLOWER(ch)) {
11857 if (!previous_is_cased)
11858 return PyBool_FromLong(0);
11859 previous_is_cased = 1;
11860 cased = 1;
11861 }
11862 else
11863 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011865 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866}
11867
INADA Naoki3ae20562017-01-16 20:41:20 +090011868/*[clinic input]
11869str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870
INADA Naoki3ae20562017-01-16 20:41:20 +090011871Return True if the string is a whitespace string, False otherwise.
11872
11873A string is whitespace if all characters in the string are whitespace and there
11874is at least one character in the string.
11875[clinic start generated code]*/
11876
11877static PyObject *
11878unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011879/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 Py_ssize_t i, length;
11882 int kind;
11883 void *data;
11884
11885 if (PyUnicode_READY(self) == -1)
11886 return NULL;
11887 length = PyUnicode_GET_LENGTH(self);
11888 kind = PyUnicode_KIND(self);
11889 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 if (length == 1)
11893 return PyBool_FromLong(
11894 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011896 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 for (i = 0; i < length; i++) {
11901 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011902 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011905 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906}
11907
INADA Naoki3ae20562017-01-16 20:41:20 +090011908/*[clinic input]
11909str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011910
INADA Naoki3ae20562017-01-16 20:41:20 +090011911Return True if the string is an alphabetic string, False otherwise.
11912
11913A string is alphabetic if all characters in the string are alphabetic and there
11914is at least one character in the string.
11915[clinic start generated code]*/
11916
11917static PyObject *
11918unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011919/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 Py_ssize_t i, length;
11922 int kind;
11923 void *data;
11924
11925 if (PyUnicode_READY(self) == -1)
11926 return NULL;
11927 length = PyUnicode_GET_LENGTH(self);
11928 kind = PyUnicode_KIND(self);
11929 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011930
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011931 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 if (length == 1)
11933 return PyBool_FromLong(
11934 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011935
11936 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 for (i = 0; i < length; i++) {
11941 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011943 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011944 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011945}
11946
INADA Naoki3ae20562017-01-16 20:41:20 +090011947/*[clinic input]
11948str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011949
INADA Naoki3ae20562017-01-16 20:41:20 +090011950Return True if the string is an alpha-numeric string, False otherwise.
11951
11952A string is alpha-numeric if all characters in the string are alpha-numeric and
11953there is at least one character in the string.
11954[clinic start generated code]*/
11955
11956static PyObject *
11957unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011958/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011959{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 int kind;
11961 void *data;
11962 Py_ssize_t len, i;
11963
11964 if (PyUnicode_READY(self) == -1)
11965 return NULL;
11966
11967 kind = PyUnicode_KIND(self);
11968 data = PyUnicode_DATA(self);
11969 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011970
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011971 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (len == 1) {
11973 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11974 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11975 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011976
11977 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 for (i = 0; i < len; i++) {
11982 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011983 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011984 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011985 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011986 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011987}
11988
INADA Naoki3ae20562017-01-16 20:41:20 +090011989/*[clinic input]
11990str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991
INADA Naoki3ae20562017-01-16 20:41:20 +090011992Return True if the string is a decimal string, False otherwise.
11993
11994A string is a decimal string if all characters in the string are decimal and
11995there is at least one character in the string.
11996[clinic start generated code]*/
11997
11998static PyObject *
11999unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012000/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 Py_ssize_t i, length;
12003 int kind;
12004 void *data;
12005
12006 if (PyUnicode_READY(self) == -1)
12007 return NULL;
12008 length = PyUnicode_GET_LENGTH(self);
12009 kind = PyUnicode_KIND(self);
12010 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (length == 1)
12014 return PyBool_FromLong(
12015 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012017 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 for (i = 0; i < length; i++) {
12022 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012025 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026}
12027
INADA Naoki3ae20562017-01-16 20:41:20 +090012028/*[clinic input]
12029str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030
INADA Naoki3ae20562017-01-16 20:41:20 +090012031Return True if the string is a digit string, False otherwise.
12032
12033A string is a digit string if all characters in the string are digits and there
12034is at least one character in the string.
12035[clinic start generated code]*/
12036
12037static PyObject *
12038unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012039/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 Py_ssize_t i, length;
12042 int kind;
12043 void *data;
12044
12045 if (PyUnicode_READY(self) == -1)
12046 return NULL;
12047 length = PyUnicode_GET_LENGTH(self);
12048 kind = PyUnicode_KIND(self);
12049 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 if (length == 1) {
12053 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12054 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012057 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012059 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 for (i = 0; i < length; i++) {
12062 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012063 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012065 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066}
12067
INADA Naoki3ae20562017-01-16 20:41:20 +090012068/*[clinic input]
12069str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
INADA Naoki3ae20562017-01-16 20:41:20 +090012071Return True if the string is a numeric string, False otherwise.
12072
12073A string is numeric if all characters in the string are numeric and there is at
12074least one character in the string.
12075[clinic start generated code]*/
12076
12077static PyObject *
12078unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012079/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 Py_ssize_t i, length;
12082 int kind;
12083 void *data;
12084
12085 if (PyUnicode_READY(self) == -1)
12086 return NULL;
12087 length = PyUnicode_GET_LENGTH(self);
12088 kind = PyUnicode_KIND(self);
12089 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 if (length == 1)
12093 return PyBool_FromLong(
12094 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012096 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 for (i = 0; i < length; i++) {
12101 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012102 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012104 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105}
12106
Martin v. Löwis47383402007-08-15 07:32:56 +000012107int
12108PyUnicode_IsIdentifier(PyObject *self)
12109{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 int kind;
12111 void *data;
12112 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012113 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (PyUnicode_READY(self) == -1) {
12116 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 }
12119
12120 /* Special case for empty strings */
12121 if (PyUnicode_GET_LENGTH(self) == 0)
12122 return 0;
12123 kind = PyUnicode_KIND(self);
12124 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012125
12126 /* PEP 3131 says that the first character must be in
12127 XID_Start and subsequent characters in XID_Continue,
12128 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012129 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012130 letters, digits, underscore). However, given the current
12131 definition of XID_Start and XID_Continue, it is sufficient
12132 to check just for these, except that _ must be allowed
12133 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012135 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012136 return 0;
12137
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012138 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012141 return 1;
12142}
12143
INADA Naoki3ae20562017-01-16 20:41:20 +090012144/*[clinic input]
12145str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012146
INADA Naoki3ae20562017-01-16 20:41:20 +090012147Return True if the string is a valid Python identifier, False otherwise.
12148
12149Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12150"class".
12151[clinic start generated code]*/
12152
12153static PyObject *
12154unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012155/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012156{
12157 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12158}
12159
INADA Naoki3ae20562017-01-16 20:41:20 +090012160/*[clinic input]
12161str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012162
INADA Naoki3ae20562017-01-16 20:41:20 +090012163Return True if the string is printable, False otherwise.
12164
12165A string is printable if all of its characters are considered printable in
12166repr() or if it is empty.
12167[clinic start generated code]*/
12168
12169static PyObject *
12170unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012171/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 Py_ssize_t i, length;
12174 int kind;
12175 void *data;
12176
12177 if (PyUnicode_READY(self) == -1)
12178 return NULL;
12179 length = PyUnicode_GET_LENGTH(self);
12180 kind = PyUnicode_KIND(self);
12181 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012182
12183 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 if (length == 1)
12185 return PyBool_FromLong(
12186 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 for (i = 0; i < length; i++) {
12189 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012190 Py_RETURN_FALSE;
12191 }
12192 }
12193 Py_RETURN_TRUE;
12194}
12195
INADA Naoki3ae20562017-01-16 20:41:20 +090012196/*[clinic input]
12197str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
INADA Naoki3ae20562017-01-16 20:41:20 +090012199 iterable: object
12200 /
12201
12202Concatenate any number of strings.
12203
Martin Panter91a88662017-01-24 00:30:06 +000012204The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012205The result is returned as a new string.
12206
12207Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12208[clinic start generated code]*/
12209
12210static PyObject *
12211unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012212/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213{
INADA Naoki3ae20562017-01-16 20:41:20 +090012214 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215}
12216
Martin v. Löwis18e16552006-02-15 17:27:45 +000012217static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012218unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 if (PyUnicode_READY(self) == -1)
12221 return -1;
12222 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223}
12224
INADA Naoki3ae20562017-01-16 20:41:20 +090012225/*[clinic input]
12226str.ljust as unicode_ljust
12227
12228 width: Py_ssize_t
12229 fillchar: Py_UCS4 = ' '
12230 /
12231
12232Return a left-justified string of length width.
12233
12234Padding is done using the specified fill character (default is a space).
12235[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236
12237static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012238unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12239/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012241 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243
Victor Stinnerc4b49542011-12-11 22:44:26 +010012244 if (PyUnicode_GET_LENGTH(self) >= width)
12245 return unicode_result_unchanged(self);
12246
12247 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248}
12249
INADA Naoki3ae20562017-01-16 20:41:20 +090012250/*[clinic input]
12251str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252
INADA Naoki3ae20562017-01-16 20:41:20 +090012253Return a copy of the string converted to lowercase.
12254[clinic start generated code]*/
12255
12256static PyObject *
12257unicode_lower_impl(PyObject *self)
12258/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012260 if (PyUnicode_READY(self) == -1)
12261 return NULL;
12262 if (PyUnicode_IS_ASCII(self))
12263 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012264 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
/* Strip modes: which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table slots are const so the lookup table cannot be modified. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip mode to the name of the corresponding str method. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012275
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012276/* externally visible for str.strip(unicode) */
12277PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012278_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012279{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 void *data;
12281 int kind;
12282 Py_ssize_t i, j, len;
12283 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012284 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12287 return NULL;
12288
12289 kind = PyUnicode_KIND(self);
12290 data = PyUnicode_DATA(self);
12291 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012292 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12294 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012295 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012296
Benjamin Peterson14339b62009-01-31 16:36:08 +000012297 i = 0;
12298 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012299 while (i < len) {
12300 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12301 if (!BLOOM(sepmask, ch))
12302 break;
12303 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12304 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 i++;
12306 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012307 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012308
Benjamin Peterson14339b62009-01-31 16:36:08 +000012309 j = len;
12310 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012311 j--;
12312 while (j >= i) {
12313 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12314 if (!BLOOM(sepmask, ch))
12315 break;
12316 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12317 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012319 }
12320
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012322 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323
Victor Stinner7931d9a2011-11-04 00:22:48 +010012324 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325}
12326
12327PyObject*
12328PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12329{
12330 unsigned char *data;
12331 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012332 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333
Victor Stinnerde636f32011-10-01 03:55:54 +020012334 if (PyUnicode_READY(self) == -1)
12335 return NULL;
12336
Victor Stinner684d5fd2012-05-03 02:32:34 +020012337 length = PyUnicode_GET_LENGTH(self);
12338 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012339
Victor Stinner684d5fd2012-05-03 02:32:34 +020012340 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012341 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342
Victor Stinnerde636f32011-10-01 03:55:54 +020012343 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012344 PyErr_SetString(PyExc_IndexError, "string index out of range");
12345 return NULL;
12346 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012347 if (start >= length || end < start)
12348 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012349
Victor Stinner684d5fd2012-05-03 02:32:34 +020012350 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012351 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012352 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012353 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012354 }
12355 else {
12356 kind = PyUnicode_KIND(self);
12357 data = PyUnicode_1BYTE_DATA(self);
12358 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012359 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012360 length);
12361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363
12364static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012365do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 Py_ssize_t len, i, j;
12368
12369 if (PyUnicode_READY(self) == -1)
12370 return NULL;
12371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012373
Victor Stinnercc7af722013-04-09 22:39:24 +020012374 if (PyUnicode_IS_ASCII(self)) {
12375 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12376
12377 i = 0;
12378 if (striptype != RIGHTSTRIP) {
12379 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012380 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012381 if (!_Py_ascii_whitespace[ch])
12382 break;
12383 i++;
12384 }
12385 }
12386
12387 j = len;
12388 if (striptype != LEFTSTRIP) {
12389 j--;
12390 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012391 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012392 if (!_Py_ascii_whitespace[ch])
12393 break;
12394 j--;
12395 }
12396 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 }
12398 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012399 else {
12400 int kind = PyUnicode_KIND(self);
12401 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012402
Victor Stinnercc7af722013-04-09 22:39:24 +020012403 i = 0;
12404 if (striptype != RIGHTSTRIP) {
12405 while (i < len) {
12406 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12407 if (!Py_UNICODE_ISSPACE(ch))
12408 break;
12409 i++;
12410 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012411 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012412
12413 j = len;
12414 if (striptype != LEFTSTRIP) {
12415 j--;
12416 while (j >= i) {
12417 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12418 if (!Py_UNICODE_ISSPACE(ch))
12419 break;
12420 j--;
12421 }
12422 j++;
12423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012424 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012425
Victor Stinner7931d9a2011-11-04 00:22:48 +010012426 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427}
12428
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012429
12430static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012431do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012432{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012433 if (sep != NULL && sep != Py_None) {
12434 if (PyUnicode_Check(sep))
12435 return _PyUnicode_XStrip(self, striptype, sep);
12436 else {
12437 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 "%s arg must be None or str",
12439 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012440 return NULL;
12441 }
12442 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012443
Benjamin Peterson14339b62009-01-31 16:36:08 +000012444 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012445}
12446
12447
INADA Naoki3ae20562017-01-16 20:41:20 +090012448/*[clinic input]
12449str.strip as unicode_strip
12450
12451 chars: object = None
12452 /
12453
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012455
12456If chars is given and not None, remove characters in chars instead.
12457[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458
12459static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012460unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012461/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012462{
INADA Naoki3ae20562017-01-16 20:41:20 +090012463 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012464}
12465
12466
INADA Naoki3ae20562017-01-16 20:41:20 +090012467/*[clinic input]
12468str.lstrip as unicode_lstrip
12469
12470 chars: object = NULL
12471 /
12472
12473Return a copy of the string with leading whitespace removed.
12474
12475If chars is given and not None, remove characters in chars instead.
12476[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012477
12478static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012479unicode_lstrip_impl(PyObject *self, PyObject *chars)
12480/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012481{
INADA Naoki3ae20562017-01-16 20:41:20 +090012482 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012483}
12484
12485
INADA Naoki3ae20562017-01-16 20:41:20 +090012486/*[clinic input]
12487str.rstrip as unicode_rstrip
12488
12489 chars: object = NULL
12490 /
12491
12492Return a copy of the string with trailing whitespace removed.
12493
12494If chars is given and not None, remove characters in chars instead.
12495[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012496
12497static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012498unicode_rstrip_impl(PyObject *self, PyObject *chars)
12499/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012500{
INADA Naoki3ae20562017-01-16 20:41:20 +090012501 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012502}
12503
12504
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012506unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012508 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510
Serhiy Storchaka05997252013-01-26 12:14:02 +020012511 if (len < 1)
12512 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
Victor Stinnerc4b49542011-12-11 22:44:26 +010012514 /* no repeat, return original string */
12515 if (len == 1)
12516 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012517
Benjamin Petersonbac79492012-01-14 13:34:47 -050012518 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 return NULL;
12520
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012521 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012522 PyErr_SetString(PyExc_OverflowError,
12523 "repeated string is too long");
12524 return NULL;
12525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012527
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012528 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529 if (!u)
12530 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012531 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 if (PyUnicode_GET_LENGTH(str) == 1) {
12534 const int kind = PyUnicode_KIND(str);
12535 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012536 if (kind == PyUnicode_1BYTE_KIND) {
12537 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012538 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012539 }
12540 else if (kind == PyUnicode_2BYTE_KIND) {
12541 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012542 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012543 ucs2[n] = fill_char;
12544 } else {
12545 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12546 assert(kind == PyUnicode_4BYTE_KIND);
12547 for (n = 0; n < len; ++n)
12548 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 }
12551 else {
12552 /* number of characters copied this far */
12553 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012554 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012556 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012560 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012561 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563 }
12564
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012565 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012566 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567}
12568
Alexander Belopolsky40018472011-02-26 01:02:56 +000012569PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012570PyUnicode_Replace(PyObject *str,
12571 PyObject *substr,
12572 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012573 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012575 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12576 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012577 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012578 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579}
12580
INADA Naoki3ae20562017-01-16 20:41:20 +090012581/*[clinic input]
12582str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583
INADA Naoki3ae20562017-01-16 20:41:20 +090012584 old: unicode
12585 new: unicode
12586 count: Py_ssize_t = -1
12587 Maximum number of occurrences to replace.
12588 -1 (the default value) means replace all occurrences.
12589 /
12590
12591Return a copy with all occurrences of substring old replaced by new.
12592
12593If the optional argument count is given, only the first count occurrences are
12594replaced.
12595[clinic start generated code]*/
12596
12597static PyObject *
12598unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12599 Py_ssize_t count)
12600/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012602 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012604 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605}
12606
Alexander Belopolsky40018472011-02-26 01:02:56 +000012607static PyObject *
12608unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012610 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 Py_ssize_t isize;
12612 Py_ssize_t osize, squote, dquote, i, o;
12613 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012614 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012618 return NULL;
12619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 isize = PyUnicode_GET_LENGTH(unicode);
12621 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 /* Compute length of output, quote characters, and
12624 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012625 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 max = 127;
12627 squote = dquote = 0;
12628 ikind = PyUnicode_KIND(unicode);
12629 for (i = 0; i < isize; i++) {
12630 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012631 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012633 case '\'': squote++; break;
12634 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012636 incr = 2;
12637 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 default:
12639 /* Fast-path ASCII */
12640 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012641 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012643 ;
12644 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012647 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012649 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012651 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012653 if (osize > PY_SSIZE_T_MAX - incr) {
12654 PyErr_SetString(PyExc_OverflowError,
12655 "string is too long to generate repr");
12656 return NULL;
12657 }
12658 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 }
12660
12661 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012662 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012664 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 if (dquote)
12666 /* Both squote and dquote present. Use squote,
12667 and escape them */
12668 osize += squote;
12669 else
12670 quote = '"';
12671 }
Victor Stinner55c08782013-04-14 18:45:39 +020012672 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673
12674 repr = PyUnicode_New(osize, max);
12675 if (repr == NULL)
12676 return NULL;
12677 okind = PyUnicode_KIND(repr);
12678 odata = PyUnicode_DATA(repr);
12679
12680 PyUnicode_WRITE(okind, odata, 0, quote);
12681 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012682 if (unchanged) {
12683 _PyUnicode_FastCopyCharacters(repr, 1,
12684 unicode, 0,
12685 isize);
12686 }
12687 else {
12688 for (i = 0, o = 1; i < isize; i++) {
12689 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690
Victor Stinner55c08782013-04-14 18:45:39 +020012691 /* Escape quotes and backslashes */
12692 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012693 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012695 continue;
12696 }
12697
12698 /* Map special whitespace to '\t', \n', '\r' */
12699 if (ch == '\t') {
12700 PyUnicode_WRITE(okind, odata, o++, '\\');
12701 PyUnicode_WRITE(okind, odata, o++, 't');
12702 }
12703 else if (ch == '\n') {
12704 PyUnicode_WRITE(okind, odata, o++, '\\');
12705 PyUnicode_WRITE(okind, odata, o++, 'n');
12706 }
12707 else if (ch == '\r') {
12708 PyUnicode_WRITE(okind, odata, o++, '\\');
12709 PyUnicode_WRITE(okind, odata, o++, 'r');
12710 }
12711
12712 /* Map non-printable US ASCII to '\xhh' */
12713 else if (ch < ' ' || ch == 0x7F) {
12714 PyUnicode_WRITE(okind, odata, o++, '\\');
12715 PyUnicode_WRITE(okind, odata, o++, 'x');
12716 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12717 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12718 }
12719
12720 /* Copy ASCII characters as-is */
12721 else if (ch < 0x7F) {
12722 PyUnicode_WRITE(okind, odata, o++, ch);
12723 }
12724
12725 /* Non-ASCII characters */
12726 else {
12727 /* Map Unicode whitespace and control characters
12728 (categories Z* and C* except ASCII space)
12729 */
12730 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12731 PyUnicode_WRITE(okind, odata, o++, '\\');
12732 /* Map 8-bit characters to '\xhh' */
12733 if (ch <= 0xff) {
12734 PyUnicode_WRITE(okind, odata, o++, 'x');
12735 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12736 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12737 }
12738 /* Map 16-bit characters to '\uxxxx' */
12739 else if (ch <= 0xffff) {
12740 PyUnicode_WRITE(okind, odata, o++, 'u');
12741 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12743 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12744 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12745 }
12746 /* Map 21-bit characters to '\U00xxxxxx' */
12747 else {
12748 PyUnicode_WRITE(okind, odata, o++, 'U');
12749 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12750 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12755 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12756 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12757 }
12758 }
12759 /* Copy characters as-is */
12760 else {
12761 PyUnicode_WRITE(okind, odata, o++, ch);
12762 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012763 }
12764 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012766 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012767 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012768 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769}
12770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012771PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773\n\
12774Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012775such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776arguments start and end are interpreted as in slice notation.\n\
12777\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012778Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779
12780static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012783 /* initialize variables to prevent gcc warning */
12784 PyObject *substring = NULL;
12785 Py_ssize_t start = 0;
12786 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012789 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012792 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012795 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 if (result == -2)
12798 return NULL;
12799
Christian Heimes217cfd12007-12-02 14:31:20 +000012800 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801}
12802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012803PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012804 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012806Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807
12808static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012811 /* initialize variables to prevent gcc warning */
12812 PyObject *substring = NULL;
12813 Py_ssize_t start = 0;
12814 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012815 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012817 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012818 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012820 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012823 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 if (result == -2)
12826 return NULL;
12827
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828 if (result < 0) {
12829 PyErr_SetString(PyExc_ValueError, "substring not found");
12830 return NULL;
12831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832
Christian Heimes217cfd12007-12-02 14:31:20 +000012833 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834}
12835
INADA Naoki3ae20562017-01-16 20:41:20 +090012836/*[clinic input]
12837str.rjust as unicode_rjust
12838
12839 width: Py_ssize_t
12840 fillchar: Py_UCS4 = ' '
12841 /
12842
12843Return a right-justified string of length width.
12844
12845Padding is done using the specified fill character (default is a space).
12846[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847
12848static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012849unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12850/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012852 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853 return NULL;
12854
Victor Stinnerc4b49542011-12-11 22:44:26 +010012855 if (PyUnicode_GET_LENGTH(self) >= width)
12856 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857
Victor Stinnerc4b49542011-12-11 22:44:26 +010012858 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859}
12860
Alexander Belopolsky40018472011-02-26 01:02:56 +000012861PyObject *
12862PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012864 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012867 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868}
12869
INADA Naoki3ae20562017-01-16 20:41:20 +090012870/*[clinic input]
12871str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872
INADA Naoki3ae20562017-01-16 20:41:20 +090012873 sep: object = None
        The delimiter according to which to split the string.
12875 None (the default value) means split according to any whitespace,
12876 and discard empty strings from the result.
12877 maxsplit: Py_ssize_t = -1
12878 Maximum number of splits to do.
12879 -1 (the default value) means no limit.
12880
12881Return a list of the words in the string, using sep as the delimiter string.
12882[clinic start generated code]*/
12883
12884static PyObject *
12885unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12886/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887{
INADA Naoki3ae20562017-01-16 20:41:20 +090012888 if (sep == Py_None)
12889 return split(self, NULL, maxsplit);
12890 if (PyUnicode_Check(sep))
12891 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012892
12893 PyErr_Format(PyExc_TypeError,
12894 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012895 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897}
12898
Thomas Wouters477c8d52006-05-27 19:21:47 +000012899PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012900PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012902 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012903 int kind1, kind2;
12904 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012907 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012909
Victor Stinner14f8f022011-10-05 20:58:25 +020012910 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 len1 = PyUnicode_GET_LENGTH(str_obj);
12913 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012914 if (kind1 < kind2 || len1 < len2) {
12915 _Py_INCREF_UNICODE_EMPTY();
12916 if (!unicode_empty)
12917 out = NULL;
12918 else {
12919 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12920 Py_DECREF(unicode_empty);
12921 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012922 return out;
12923 }
12924 buf1 = PyUnicode_DATA(str_obj);
12925 buf2 = PyUnicode_DATA(sep_obj);
12926 if (kind2 != kind1) {
12927 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12928 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012929 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012930 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012934 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12935 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12936 else
12937 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 break;
12939 case PyUnicode_2BYTE_KIND:
12940 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12941 break;
12942 case PyUnicode_4BYTE_KIND:
12943 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12944 break;
12945 default:
12946 assert(0);
12947 out = 0;
12948 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012949
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012950 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012952
12953 return out;
12954}
12955
12956
12957PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012958PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012959{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012960 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012961 int kind1, kind2;
12962 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012963 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012964
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012965 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012966 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012967
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012968 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 len1 = PyUnicode_GET_LENGTH(str_obj);
12971 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012972 if (kind1 < kind2 || len1 < len2) {
12973 _Py_INCREF_UNICODE_EMPTY();
12974 if (!unicode_empty)
12975 out = NULL;
12976 else {
12977 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12978 Py_DECREF(unicode_empty);
12979 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012980 return out;
12981 }
12982 buf1 = PyUnicode_DATA(str_obj);
12983 buf2 = PyUnicode_DATA(sep_obj);
12984 if (kind2 != kind1) {
12985 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12986 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012987 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012990 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012992 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12993 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12994 else
12995 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 break;
12997 case PyUnicode_2BYTE_KIND:
12998 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12999 break;
13000 case PyUnicode_4BYTE_KIND:
13001 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13002 break;
13003 default:
13004 assert(0);
13005 out = 0;
13006 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013007
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013010
13011 return out;
13012}
13013
INADA Naoki3ae20562017-01-16 20:41:20 +090013014/*[clinic input]
13015str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013016
INADA Naoki3ae20562017-01-16 20:41:20 +090013017 sep: object
13018 /
13019
13020Partition the string into three parts using the given separator.
13021
13022This will search for the separator in the string. If the separator is found,
13023returns a 3-tuple containing the part before the separator, the separator
13024itself, and the part after it.
13025
13026If the separator is not found, returns a 3-tuple containing the original string
13027and two empty strings.
13028[clinic start generated code]*/
13029
13030static PyObject *
13031unicode_partition(PyObject *self, PyObject *sep)
13032/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013033{
INADA Naoki3ae20562017-01-16 20:41:20 +090013034 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013035}
13036
INADA Naoki3ae20562017-01-16 20:41:20 +090013037/*[clinic input]
13038str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013039
INADA Naoki3ae20562017-01-16 20:41:20 +090013040Partition the string into three parts using the given separator.
13041
This will search for the separator in the string, starting at the end. If
13043the separator is found, returns a 3-tuple containing the part before the
13044separator, the separator itself, and the part after it.
13045
13046If the separator is not found, returns a 3-tuple containing two empty strings
13047and the original string.
13048[clinic start generated code]*/
13049
13050static PyObject *
13051unicode_rpartition(PyObject *self, PyObject *sep)
13052/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053{
INADA Naoki3ae20562017-01-16 20:41:20 +090013054 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055}
13056
Alexander Belopolsky40018472011-02-26 01:02:56 +000013057PyObject *
13058PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013059{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013060 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013061 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013062
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013063 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013064}
13065
INADA Naoki3ae20562017-01-16 20:41:20 +090013066/*[clinic input]
13067str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013068
INADA Naoki3ae20562017-01-16 20:41:20 +090013069Return a list of the words in the string, using sep as the delimiter string.
13070
13071Splits are done starting at the end of the string and working to the front.
13072[clinic start generated code]*/
13073
13074static PyObject *
13075unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13076/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013077{
INADA Naoki3ae20562017-01-16 20:41:20 +090013078 if (sep == Py_None)
13079 return rsplit(self, NULL, maxsplit);
13080 if (PyUnicode_Check(sep))
13081 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013082
13083 PyErr_Format(PyExc_TypeError,
13084 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013085 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013086 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013087}
13088
INADA Naoki3ae20562017-01-16 20:41:20 +090013089/*[clinic input]
13090str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091
INADA Naoki3ae20562017-01-16 20:41:20 +090013092 keepends: int(c_default="0") = False
13093
13094Return a list of the lines in the string, breaking at line boundaries.
13095
13096Line breaks are not included in the resulting list unless keepends is given and
13097true.
13098[clinic start generated code]*/
13099
13100static PyObject *
13101unicode_splitlines_impl(PyObject *self, int keepends)
13102/*[clinic end generated code: output=f664dcdad153ec40 input=d6ff99fe43465b0f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013104 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105}
13106
13107static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013108PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013110 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111}
13112
INADA Naoki3ae20562017-01-16 20:41:20 +090013113/*[clinic input]
13114str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115
INADA Naoki3ae20562017-01-16 20:41:20 +090013116Convert uppercase characters to lowercase and lowercase characters to uppercase.
13117[clinic start generated code]*/
13118
13119static PyObject *
13120unicode_swapcase_impl(PyObject *self)
13121/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013123 if (PyUnicode_READY(self) == -1)
13124 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013125 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126}
13127
Larry Hastings61272b72014-01-07 12:41:53 -080013128/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013129
Larry Hastings31826802013-10-19 00:09:25 -070013130@staticmethod
13131str.maketrans as unicode_maketrans
13132
13133 x: object
13134
13135 y: unicode=NULL
13136
13137 z: unicode=NULL
13138
13139 /
13140
13141Return a translation table usable for str.translate().
13142
13143If there is only one argument, it must be a dictionary mapping Unicode
13144ordinals (integers) or characters to Unicode ordinals, strings or None.
13145Character keys will be then converted to ordinals.
13146If there are two arguments, they must be strings of equal length, and
13147in the resulting dictionary, each character in x will be mapped to the
13148character at the same position in y. If there is a third argument, it
13149must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013150[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013151
Larry Hastings31826802013-10-19 00:09:25 -070013152static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013153unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013154/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013155{
Georg Brandlceee0772007-11-27 23:48:05 +000013156 PyObject *new = NULL, *key, *value;
13157 Py_ssize_t i = 0;
13158 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159
Georg Brandlceee0772007-11-27 23:48:05 +000013160 new = PyDict_New();
13161 if (!new)
13162 return NULL;
13163 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 int x_kind, y_kind, z_kind;
13165 void *x_data, *y_data, *z_data;
13166
Georg Brandlceee0772007-11-27 23:48:05 +000013167 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013168 if (!PyUnicode_Check(x)) {
13169 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13170 "be a string if there is a second argument");
13171 goto err;
13172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013174 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13175 "arguments must have equal length");
13176 goto err;
13177 }
13178 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 x_kind = PyUnicode_KIND(x);
13180 y_kind = PyUnicode_KIND(y);
13181 x_data = PyUnicode_DATA(x);
13182 y_data = PyUnicode_DATA(y);
13183 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13184 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013185 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013186 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013187 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013188 if (!value) {
13189 Py_DECREF(key);
13190 goto err;
13191 }
Georg Brandlceee0772007-11-27 23:48:05 +000013192 res = PyDict_SetItem(new, key, value);
13193 Py_DECREF(key);
13194 Py_DECREF(value);
13195 if (res < 0)
13196 goto err;
13197 }
13198 /* create entries for deleting chars in z */
13199 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 z_kind = PyUnicode_KIND(z);
13201 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013202 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013204 if (!key)
13205 goto err;
13206 res = PyDict_SetItem(new, key, Py_None);
13207 Py_DECREF(key);
13208 if (res < 0)
13209 goto err;
13210 }
13211 }
13212 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 int kind;
13214 void *data;
13215
Georg Brandlceee0772007-11-27 23:48:05 +000013216 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013217 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013218 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13219 "to maketrans it must be a dict");
13220 goto err;
13221 }
13222 /* copy entries into the new dict, converting string keys to int keys */
13223 while (PyDict_Next(x, &i, &key, &value)) {
13224 if (PyUnicode_Check(key)) {
13225 /* convert string keys to integer keys */
13226 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013227 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013228 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13229 "table must be of length 1");
13230 goto err;
13231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 kind = PyUnicode_KIND(key);
13233 data = PyUnicode_DATA(key);
13234 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013235 if (!newkey)
13236 goto err;
13237 res = PyDict_SetItem(new, newkey, value);
13238 Py_DECREF(newkey);
13239 if (res < 0)
13240 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013241 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013242 /* just keep integer keys */
13243 if (PyDict_SetItem(new, key, value) < 0)
13244 goto err;
13245 } else {
13246 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13247 "be strings or integers");
13248 goto err;
13249 }
13250 }
13251 }
13252 return new;
13253 err:
13254 Py_DECREF(new);
13255 return NULL;
13256}
13257
INADA Naoki3ae20562017-01-16 20:41:20 +090013258/*[clinic input]
13259str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260
INADA Naoki3ae20562017-01-16 20:41:20 +090013261 table: object
13262 Translation table, which must be a mapping of Unicode ordinals to
13263 Unicode ordinals, strings, or None.
13264 /
13265
13266Replace each character in the string using the given translation table.
13267
13268The table must implement lookup/indexing via __getitem__, for instance a
13269dictionary or list. If this operation raises LookupError, the character is
13270left untouched. Characters mapped to None are deleted.
13271[clinic start generated code]*/
13272
13273static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013275/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278}
13279
INADA Naoki3ae20562017-01-16 20:41:20 +090013280/*[clinic input]
13281str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013282
INADA Naoki3ae20562017-01-16 20:41:20 +090013283Return a copy of the string converted to uppercase.
13284[clinic start generated code]*/
13285
13286static PyObject *
13287unicode_upper_impl(PyObject *self)
13288/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013290 if (PyUnicode_READY(self) == -1)
13291 return NULL;
13292 if (PyUnicode_IS_ASCII(self))
13293 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013294 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295}
13296
INADA Naoki3ae20562017-01-16 20:41:20 +090013297/*[clinic input]
13298str.zfill as unicode_zfill
13299
13300 width: Py_ssize_t
13301 /
13302
13303Pad a numeric string with zeros on the left, to fill a field of the given width.
13304
13305The string is never truncated.
13306[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307
13308static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013309unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013310/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013312 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013313 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 int kind;
13315 void *data;
13316 Py_UCS4 chr;
13317
Benjamin Petersonbac79492012-01-14 13:34:47 -050013318 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013320
Victor Stinnerc4b49542011-12-11 22:44:26 +010013321 if (PyUnicode_GET_LENGTH(self) >= width)
13322 return unicode_result_unchanged(self);
13323
13324 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013325
13326 u = pad(self, fill, 0, '0');
13327
Walter Dörwald068325e2002-04-15 13:36:47 +000013328 if (u == NULL)
13329 return NULL;
13330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 kind = PyUnicode_KIND(u);
13332 data = PyUnicode_DATA(u);
13333 chr = PyUnicode_READ(kind, data, fill);
13334
13335 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013337 PyUnicode_WRITE(kind, data, 0, chr);
13338 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339 }
13340
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013341 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013342 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344
#if 0
/* Compiled out.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013353PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013354 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013356Return True if S starts with the specified prefix, False otherwise.\n\
13357With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013358With optional end, stop comparing S at that position.\n\
13359prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360
13361static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013362unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013363 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013365 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013366 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013367 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013368 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013369 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013370
Jesus Ceaac451502011-04-20 17:09:23 +020013371 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013373 if (PyTuple_Check(subobj)) {
13374 Py_ssize_t i;
13375 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013376 substring = PyTuple_GET_ITEM(subobj, i);
13377 if (!PyUnicode_Check(substring)) {
13378 PyErr_Format(PyExc_TypeError,
13379 "tuple for startswith must only contain str, "
13380 "not %.100s",
13381 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013383 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013384 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013385 if (result == -1)
13386 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013387 if (result) {
13388 Py_RETURN_TRUE;
13389 }
13390 }
13391 /* nothing matched */
13392 Py_RETURN_FALSE;
13393 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013394 if (!PyUnicode_Check(subobj)) {
13395 PyErr_Format(PyExc_TypeError,
13396 "startswith first arg must be str or "
13397 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013399 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013400 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013401 if (result == -1)
13402 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013403 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013404}
13405
13406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013407PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013410Return True if S ends with the specified suffix, False otherwise.\n\
13411With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013412With optional end, stop comparing S at that position.\n\
13413suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414
13415static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013416unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013418{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013419 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013420 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013421 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013422 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013423 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424
Jesus Ceaac451502011-04-20 17:09:23 +020013425 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013427 if (PyTuple_Check(subobj)) {
13428 Py_ssize_t i;
13429 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013430 substring = PyTuple_GET_ITEM(subobj, i);
13431 if (!PyUnicode_Check(substring)) {
13432 PyErr_Format(PyExc_TypeError,
13433 "tuple for endswith must only contain str, "
13434 "not %.100s",
13435 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013437 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013438 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013439 if (result == -1)
13440 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013441 if (result) {
13442 Py_RETURN_TRUE;
13443 }
13444 }
13445 Py_RETURN_FALSE;
13446 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013447 if (!PyUnicode_Check(subobj)) {
13448 PyErr_Format(PyExc_TypeError,
13449 "endswith first arg must be str or "
13450 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013452 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013453 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013454 if (result == -1)
13455 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013456 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457}
13458
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013459static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013460_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013461{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013462 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13463 writer->data = PyUnicode_DATA(writer->buffer);
13464
13465 if (!writer->readonly) {
13466 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013467 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013468 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013469 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013470 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13471 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13472 writer->kind = PyUnicode_WCHAR_KIND;
13473 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13474
Victor Stinner8f674cc2013-04-17 23:02:17 +020013475 /* Copy-on-write mode: set buffer size to 0 so
13476 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13477 * next write. */
13478 writer->size = 0;
13479 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013480}
13481
Victor Stinnerd3f08822012-05-29 12:57:52 +020013482void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013483_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013484{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013485 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013486
13487 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013488 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013489
13490 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13491 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13492 writer->kind = PyUnicode_WCHAR_KIND;
13493 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013494}
13495
Victor Stinnerd3f08822012-05-29 12:57:52 +020013496int
13497_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13498 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013499{
13500 Py_ssize_t newlen;
13501 PyObject *newbuffer;
13502
Victor Stinner2740e462016-09-06 16:58:36 -070013503 assert(maxchar <= MAX_UNICODE);
13504
Victor Stinnerca9381e2015-09-22 00:58:32 +020013505 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013506 assert((maxchar > writer->maxchar && length >= 0)
13507 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013508
Victor Stinner202fdca2012-05-07 12:47:02 +020013509 if (length > PY_SSIZE_T_MAX - writer->pos) {
13510 PyErr_NoMemory();
13511 return -1;
13512 }
13513 newlen = writer->pos + length;
13514
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013515 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013516
Victor Stinnerd3f08822012-05-29 12:57:52 +020013517 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013518 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013519 if (writer->overallocate
13520 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13521 /* overallocate to limit the number of realloc() */
13522 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013523 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013524 if (newlen < writer->min_length)
13525 newlen = writer->min_length;
13526
Victor Stinnerd3f08822012-05-29 12:57:52 +020013527 writer->buffer = PyUnicode_New(newlen, maxchar);
13528 if (writer->buffer == NULL)
13529 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013531 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013532 if (writer->overallocate
13533 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13534 /* overallocate to limit the number of realloc() */
13535 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013537 if (newlen < writer->min_length)
13538 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013539
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013540 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013541 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013542 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013543 newbuffer = PyUnicode_New(newlen, maxchar);
13544 if (newbuffer == NULL)
13545 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013546 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13547 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013548 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013549 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013550 }
13551 else {
13552 newbuffer = resize_compact(writer->buffer, newlen);
13553 if (newbuffer == NULL)
13554 return -1;
13555 }
13556 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013557 }
13558 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013559 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560 newbuffer = PyUnicode_New(writer->size, maxchar);
13561 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013562 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013563 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13564 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013565 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013566 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013567 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013568 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013569
13570#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013571}
13572
Victor Stinnerca9381e2015-09-22 00:58:32 +020013573int
13574_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13575 enum PyUnicode_Kind kind)
13576{
13577 Py_UCS4 maxchar;
13578
13579 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13580 assert(writer->kind < kind);
13581
13582 switch (kind)
13583 {
13584 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13585 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13586 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13587 default:
13588 assert(0 && "invalid kind");
13589 return -1;
13590 }
13591
13592 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13593}
13594
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013595static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013596_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013597{
Victor Stinner2740e462016-09-06 16:58:36 -070013598 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013599 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13600 return -1;
13601 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13602 writer->pos++;
13603 return 0;
13604}
13605
13606int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013607_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13608{
13609 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13610}
13611
13612int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013613_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13614{
13615 Py_UCS4 maxchar;
13616 Py_ssize_t len;
13617
13618 if (PyUnicode_READY(str) == -1)
13619 return -1;
13620 len = PyUnicode_GET_LENGTH(str);
13621 if (len == 0)
13622 return 0;
13623 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13624 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013625 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013626 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013627 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013628 Py_INCREF(str);
13629 writer->buffer = str;
13630 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013631 writer->pos += len;
13632 return 0;
13633 }
13634 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13635 return -1;
13636 }
13637 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13638 str, 0, len);
13639 writer->pos += len;
13640 return 0;
13641}
13642
Victor Stinnere215d962012-10-06 23:03:36 +020013643int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013644_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13645 Py_ssize_t start, Py_ssize_t end)
13646{
13647 Py_UCS4 maxchar;
13648 Py_ssize_t len;
13649
13650 if (PyUnicode_READY(str) == -1)
13651 return -1;
13652
13653 assert(0 <= start);
13654 assert(end <= PyUnicode_GET_LENGTH(str));
13655 assert(start <= end);
13656
13657 if (end == 0)
13658 return 0;
13659
13660 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13661 return _PyUnicodeWriter_WriteStr(writer, str);
13662
13663 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13664 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13665 else
13666 maxchar = writer->maxchar;
13667 len = end - start;
13668
13669 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13670 return -1;
13671
13672 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13673 str, start, len);
13674 writer->pos += len;
13675 return 0;
13676}
13677
13678int
Victor Stinner4a587072013-11-19 12:54:53 +010013679_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13680 const char *ascii, Py_ssize_t len)
13681{
13682 if (len == -1)
13683 len = strlen(ascii);
13684
13685 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13686
13687 if (writer->buffer == NULL && !writer->overallocate) {
13688 PyObject *str;
13689
13690 str = _PyUnicode_FromASCII(ascii, len);
13691 if (str == NULL)
13692 return -1;
13693
13694 writer->readonly = 1;
13695 writer->buffer = str;
13696 _PyUnicodeWriter_Update(writer);
13697 writer->pos += len;
13698 return 0;
13699 }
13700
13701 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13702 return -1;
13703
13704 switch (writer->kind)
13705 {
13706 case PyUnicode_1BYTE_KIND:
13707 {
13708 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13709 Py_UCS1 *data = writer->data;
13710
Christian Heimesf051e432016-09-13 20:22:02 +020013711 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013712 break;
13713 }
13714 case PyUnicode_2BYTE_KIND:
13715 {
13716 _PyUnicode_CONVERT_BYTES(
13717 Py_UCS1, Py_UCS2,
13718 ascii, ascii + len,
13719 (Py_UCS2 *)writer->data + writer->pos);
13720 break;
13721 }
13722 case PyUnicode_4BYTE_KIND:
13723 {
13724 _PyUnicode_CONVERT_BYTES(
13725 Py_UCS1, Py_UCS4,
13726 ascii, ascii + len,
13727 (Py_UCS4 *)writer->data + writer->pos);
13728 break;
13729 }
13730 default:
13731 assert(0);
13732 }
13733
13734 writer->pos += len;
13735 return 0;
13736}
13737
13738int
13739_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13740 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013741{
13742 Py_UCS4 maxchar;
13743
13744 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13745 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13746 return -1;
13747 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13748 writer->pos += len;
13749 return 0;
13750}
13751
Victor Stinnerd3f08822012-05-29 12:57:52 +020013752PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013753_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013754{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013755 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013756
Victor Stinnerd3f08822012-05-29 12:57:52 +020013757 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013758 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013759 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013760 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013761
13762 str = writer->buffer;
13763 writer->buffer = NULL;
13764
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013765 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013766 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13767 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013768 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013769
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013770 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13771 PyObject *str2;
13772 str2 = resize_compact(str, writer->pos);
13773 if (str2 == NULL) {
13774 Py_DECREF(str);
13775 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013776 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013777 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013778 }
13779
Victor Stinner15a0bd32013-07-08 22:29:55 +020013780 assert(_PyUnicode_CheckConsistency(str, 1));
13781 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013782}
13783
Victor Stinnerd3f08822012-05-29 12:57:52 +020013784void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013785_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013786{
13787 Py_CLEAR(writer->buffer);
13788}
13789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013791
13792PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013793 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013794\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013795Return a formatted version of S, using substitutions from args and kwargs.\n\
13796The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013797
Eric Smith27bbca62010-11-04 17:06:58 +000013798PyDoc_STRVAR(format_map__doc__,
13799 "S.format_map(mapping) -> str\n\
13800\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013801Return a formatted version of S, using substitutions from mapping.\n\
13802The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013803
INADA Naoki3ae20562017-01-16 20:41:20 +090013804/*[clinic input]
13805str.__format__ as unicode___format__
13806
13807 format_spec: unicode
13808 /
13809
13810Return a formatted version of the string as described by format_spec.
13811[clinic start generated code]*/
13812
Eric Smith4a7d76d2008-05-30 18:10:19 +000013813static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013814unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013815/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013816{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013817 _PyUnicodeWriter writer;
13818 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013819
Victor Stinnerd3f08822012-05-29 12:57:52 +020013820 if (PyUnicode_READY(self) == -1)
13821 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013822 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013823 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13824 self, format_spec, 0,
13825 PyUnicode_GET_LENGTH(format_spec));
13826 if (ret == -1) {
13827 _PyUnicodeWriter_Dealloc(&writer);
13828 return NULL;
13829 }
13830 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013831}
13832
INADA Naoki3ae20562017-01-16 20:41:20 +090013833/*[clinic input]
13834str.__sizeof__ as unicode_sizeof
13835
13836Return the size of the string in memory, in bytes.
13837[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013838
13839static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013840unicode_sizeof_impl(PyObject *self)
13841/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013843 Py_ssize_t size;
13844
13845 /* If it's a compact object, account for base structure +
13846 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013847 if (PyUnicode_IS_COMPACT_ASCII(self))
13848 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13849 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013850 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013851 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013852 else {
13853 /* If it is a two-block object, account for base object, and
13854 for character block if present. */
13855 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013856 if (_PyUnicode_DATA_ANY(self))
13857 size += (PyUnicode_GET_LENGTH(self) + 1) *
13858 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013859 }
13860 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013861 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013862 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13863 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13864 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13865 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013866
13867 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013868}
13869
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013870static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013871unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013872{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013873 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013874 if (!copy)
13875 return NULL;
13876 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013877}
13878
Guido van Rossumd57fd912000-03-10 22:53:23 +000013879static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013880 UNICODE_ENCODE_METHODDEF
13881 UNICODE_REPLACE_METHODDEF
13882 UNICODE_SPLIT_METHODDEF
13883 UNICODE_RSPLIT_METHODDEF
13884 UNICODE_JOIN_METHODDEF
13885 UNICODE_CAPITALIZE_METHODDEF
13886 UNICODE_CASEFOLD_METHODDEF
13887 UNICODE_TITLE_METHODDEF
13888 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013889 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013890 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013891 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013892 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013893 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013894 UNICODE_LJUST_METHODDEF
13895 UNICODE_LOWER_METHODDEF
13896 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013897 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13898 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013899 UNICODE_RJUST_METHODDEF
13900 UNICODE_RSTRIP_METHODDEF
13901 UNICODE_RPARTITION_METHODDEF
13902 UNICODE_SPLITLINES_METHODDEF
13903 UNICODE_STRIP_METHODDEF
13904 UNICODE_SWAPCASE_METHODDEF
13905 UNICODE_TRANSLATE_METHODDEF
13906 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013907 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13908 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013909 UNICODE_ISLOWER_METHODDEF
13910 UNICODE_ISUPPER_METHODDEF
13911 UNICODE_ISTITLE_METHODDEF
13912 UNICODE_ISSPACE_METHODDEF
13913 UNICODE_ISDECIMAL_METHODDEF
13914 UNICODE_ISDIGIT_METHODDEF
13915 UNICODE_ISNUMERIC_METHODDEF
13916 UNICODE_ISALPHA_METHODDEF
13917 UNICODE_ISALNUM_METHODDEF
13918 UNICODE_ISIDENTIFIER_METHODDEF
13919 UNICODE_ISPRINTABLE_METHODDEF
13920 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013921 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013922 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013923 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013924 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013925 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013926#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013927 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013928 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013929#endif
13930
Benjamin Peterson14339b62009-01-31 16:36:08 +000013931 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013932 {NULL, NULL}
13933};
13934
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013935static PyObject *
13936unicode_mod(PyObject *v, PyObject *w)
13937{
Brian Curtindfc80e32011-08-10 20:28:54 -050013938 if (!PyUnicode_Check(v))
13939 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013941}
13942
13943static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013944 0, /*nb_add*/
13945 0, /*nb_subtract*/
13946 0, /*nb_multiply*/
13947 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013948};
13949
Guido van Rossumd57fd912000-03-10 22:53:23 +000013950static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013951 (lenfunc) unicode_length, /* sq_length */
13952 PyUnicode_Concat, /* sq_concat */
13953 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13954 (ssizeargfunc) unicode_getitem, /* sq_item */
13955 0, /* sq_slice */
13956 0, /* sq_ass_item */
13957 0, /* sq_ass_slice */
13958 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013959};
13960
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013961static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013962unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013964 if (PyUnicode_READY(self) == -1)
13965 return NULL;
13966
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013967 if (PyIndex_Check(item)) {
13968 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013969 if (i == -1 && PyErr_Occurred())
13970 return NULL;
13971 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013972 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013973 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013974 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013975 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013976 PyObject *result;
13977 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013978 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013979 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013981 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013982 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013983 return NULL;
13984 }
13985
13986 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013987 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013988 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013989 slicelength == PyUnicode_GET_LENGTH(self)) {
13990 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013991 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013992 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013993 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013994 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013995 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013996 src_kind = PyUnicode_KIND(self);
13997 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013998 if (!PyUnicode_IS_ASCII(self)) {
13999 kind_limit = kind_maxchar_limit(src_kind);
14000 max_char = 0;
14001 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14002 ch = PyUnicode_READ(src_kind, src_data, cur);
14003 if (ch > max_char) {
14004 max_char = ch;
14005 if (max_char >= kind_limit)
14006 break;
14007 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014008 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014009 }
Victor Stinner55c99112011-10-13 01:17:06 +020014010 else
14011 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014012 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014013 if (result == NULL)
14014 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014015 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014016 dest_data = PyUnicode_DATA(result);
14017
14018 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014019 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14020 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014021 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014022 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014023 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014024 } else {
14025 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14026 return NULL;
14027 }
14028}
14029
14030static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014031 (lenfunc)unicode_length, /* mp_length */
14032 (binaryfunc)unicode_subscript, /* mp_subscript */
14033 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014034};
14035
Guido van Rossumd57fd912000-03-10 22:53:23 +000014036
Guido van Rossumd57fd912000-03-10 22:53:23 +000014037/* Helpers for PyUnicode_Format() */
14038
Victor Stinnera47082312012-10-04 02:19:54 +020014039struct unicode_formatter_t {
14040 PyObject *args;
14041 int args_owned;
14042 Py_ssize_t arglen, argidx;
14043 PyObject *dict;
14044
14045 enum PyUnicode_Kind fmtkind;
14046 Py_ssize_t fmtcnt, fmtpos;
14047 void *fmtdata;
14048 PyObject *fmtstr;
14049
14050 _PyUnicodeWriter writer;
14051};
14052
14053struct unicode_format_arg_t {
14054 Py_UCS4 ch;
14055 int flags;
14056 Py_ssize_t width;
14057 int prec;
14058 int sign;
14059};
14060
Guido van Rossumd57fd912000-03-10 22:53:23 +000014061static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014062unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014063{
Victor Stinnera47082312012-10-04 02:19:54 +020014064 Py_ssize_t argidx = ctx->argidx;
14065
14066 if (argidx < ctx->arglen) {
14067 ctx->argidx++;
14068 if (ctx->arglen < 0)
14069 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014070 else
Victor Stinnera47082312012-10-04 02:19:54 +020014071 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014072 }
14073 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014074 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014075 return NULL;
14076}
14077
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014078/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014079
Victor Stinnera47082312012-10-04 02:19:54 +020014080/* Format a float into the writer if the writer is not NULL, or into *p_output
14081 otherwise.
14082
14083 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014084static int
Victor Stinnera47082312012-10-04 02:19:54 +020014085formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14086 PyObject **p_output,
14087 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014088{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014089 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014090 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014091 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014092 int prec;
14093 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014094
Guido van Rossumd57fd912000-03-10 22:53:23 +000014095 x = PyFloat_AsDouble(v);
14096 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014097 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014098
Victor Stinnera47082312012-10-04 02:19:54 +020014099 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014100 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014101 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014102
Victor Stinnera47082312012-10-04 02:19:54 +020014103 if (arg->flags & F_ALT)
14104 dtoa_flags = Py_DTSF_ALT;
14105 else
14106 dtoa_flags = 0;
14107 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014108 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014109 return -1;
14110 len = strlen(p);
14111 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014112 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014113 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014114 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014115 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014116 }
14117 else
14118 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014119 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014120 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014121}
14122
Victor Stinnerd0880d52012-04-27 23:40:13 +020014123/* formatlong() emulates the format codes d, u, o, x and X, and
14124 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14125 * Python's regular ints.
14126 * Return value: a new PyUnicodeObject*, or NULL if error.
14127 * The output string is of the form
14128 * "-"? ("0x" | "0X")? digit+
14129 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14130 * set in flags. The case of hex digits will be correct,
14131 * There will be at least prec digits, zero-filled on the left if
14132 * necessary to get that many.
14133 * val object to be converted
14134 * flags bitmask of format flags; only F_ALT is looked at
14135 * prec minimum number of digits; 0-fill on left if needed
14136 * type a character in [duoxX]; u acts the same as d
14137 *
14138 * CAUTION: o, x and X conversions on regular ints can never
14139 * produce a '-' sign, but can for Python's unbounded ints.
14140 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014141PyObject *
14142_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014143{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014144 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014145 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014146 Py_ssize_t i;
14147 int sign; /* 1 if '-', else 0 */
14148 int len; /* number of characters */
14149 Py_ssize_t llen;
14150 int numdigits; /* len == numnondigits + numdigits */
14151 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014152
Victor Stinnerd0880d52012-04-27 23:40:13 +020014153 /* Avoid exceeding SSIZE_T_MAX */
14154 if (prec > INT_MAX-3) {
14155 PyErr_SetString(PyExc_OverflowError,
14156 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014157 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014158 }
14159
14160 assert(PyLong_Check(val));
14161
14162 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014163 default:
14164 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014165 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014166 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014167 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014168 /* int and int subclasses should print numerically when a numeric */
14169 /* format code is used (see issue18780) */
14170 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014171 break;
14172 case 'o':
14173 numnondigits = 2;
14174 result = PyNumber_ToBase(val, 8);
14175 break;
14176 case 'x':
14177 case 'X':
14178 numnondigits = 2;
14179 result = PyNumber_ToBase(val, 16);
14180 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014181 }
14182 if (!result)
14183 return NULL;
14184
14185 assert(unicode_modifiable(result));
14186 assert(PyUnicode_IS_READY(result));
14187 assert(PyUnicode_IS_ASCII(result));
14188
14189 /* To modify the string in-place, there can only be one reference. */
14190 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014191 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014192 PyErr_BadInternalCall();
14193 return NULL;
14194 }
14195 buf = PyUnicode_DATA(result);
14196 llen = PyUnicode_GET_LENGTH(result);
14197 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014198 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014199 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014200 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014201 return NULL;
14202 }
14203 len = (int)llen;
14204 sign = buf[0] == '-';
14205 numnondigits += sign;
14206 numdigits = len - numnondigits;
14207 assert(numdigits > 0);
14208
14209 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014210 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014211 (type == 'o' || type == 'x' || type == 'X'))) {
14212 assert(buf[sign] == '0');
14213 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14214 buf[sign+1] == 'o');
14215 numnondigits -= 2;
14216 buf += 2;
14217 len -= 2;
14218 if (sign)
14219 buf[0] = '-';
14220 assert(len == numnondigits + numdigits);
14221 assert(numdigits > 0);
14222 }
14223
14224 /* Fill with leading zeroes to meet minimum width. */
14225 if (prec > numdigits) {
14226 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14227 numnondigits + prec);
14228 char *b1;
14229 if (!r1) {
14230 Py_DECREF(result);
14231 return NULL;
14232 }
14233 b1 = PyBytes_AS_STRING(r1);
14234 for (i = 0; i < numnondigits; ++i)
14235 *b1++ = *buf++;
14236 for (i = 0; i < prec - numdigits; i++)
14237 *b1++ = '0';
14238 for (i = 0; i < numdigits; i++)
14239 *b1++ = *buf++;
14240 *b1 = '\0';
14241 Py_DECREF(result);
14242 result = r1;
14243 buf = PyBytes_AS_STRING(result);
14244 len = numnondigits + prec;
14245 }
14246
14247 /* Fix up case for hex conversions. */
14248 if (type == 'X') {
14249 /* Need to convert all lower case letters to upper case.
14250 and need to convert 0x to 0X (and -0x to -0X). */
14251 for (i = 0; i < len; i++)
14252 if (buf[i] >= 'a' && buf[i] <= 'x')
14253 buf[i] -= 'a'-'A';
14254 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014255 if (!PyUnicode_Check(result)
14256 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014257 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014258 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014259 Py_DECREF(result);
14260 result = unicode;
14261 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014262 else if (len != PyUnicode_GET_LENGTH(result)) {
14263 if (PyUnicode_Resize(&result, len) < 0)
14264 Py_CLEAR(result);
14265 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014266 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014267}
14268
Ethan Furmandf3ed242014-01-05 06:50:30 -080014269/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014270 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014271 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014272 * -1 and raise an exception on error */
14273static int
Victor Stinnera47082312012-10-04 02:19:54 +020014274mainformatlong(PyObject *v,
14275 struct unicode_format_arg_t *arg,
14276 PyObject **p_output,
14277 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014278{
14279 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014280 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014281
14282 if (!PyNumber_Check(v))
14283 goto wrongtype;
14284
Ethan Furman9ab74802014-03-21 06:38:46 -070014285 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014286 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014287 if (type == 'o' || type == 'x' || type == 'X') {
14288 iobj = PyNumber_Index(v);
14289 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014290 if (PyErr_ExceptionMatches(PyExc_TypeError))
14291 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014292 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014293 }
14294 }
14295 else {
14296 iobj = PyNumber_Long(v);
14297 if (iobj == NULL ) {
14298 if (PyErr_ExceptionMatches(PyExc_TypeError))
14299 goto wrongtype;
14300 return -1;
14301 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014302 }
14303 assert(PyLong_Check(iobj));
14304 }
14305 else {
14306 iobj = v;
14307 Py_INCREF(iobj);
14308 }
14309
14310 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014311 && arg->width == -1 && arg->prec == -1
14312 && !(arg->flags & (F_SIGN | F_BLANK))
14313 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014314 {
14315 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014316 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014317 int base;
14318
Victor Stinnera47082312012-10-04 02:19:54 +020014319 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014320 {
14321 default:
14322 assert(0 && "'type' not in [diuoxX]");
14323 case 'd':
14324 case 'i':
14325 case 'u':
14326 base = 10;
14327 break;
14328 case 'o':
14329 base = 8;
14330 break;
14331 case 'x':
14332 case 'X':
14333 base = 16;
14334 break;
14335 }
14336
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014337 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14338 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014339 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014340 }
14341 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014342 return 1;
14343 }
14344
Ethan Furmanb95b5612015-01-23 20:05:18 -080014345 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014346 Py_DECREF(iobj);
14347 if (res == NULL)
14348 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014349 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014350 return 0;
14351
14352wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014353 switch(type)
14354 {
14355 case 'o':
14356 case 'x':
14357 case 'X':
14358 PyErr_Format(PyExc_TypeError,
14359 "%%%c format: an integer is required, "
14360 "not %.200s",
14361 type, Py_TYPE(v)->tp_name);
14362 break;
14363 default:
14364 PyErr_Format(PyExc_TypeError,
14365 "%%%c format: a number is required, "
14366 "not %.200s",
14367 type, Py_TYPE(v)->tp_name);
14368 break;
14369 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014370 return -1;
14371}
14372
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014373static Py_UCS4
14374formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014375{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014376 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014377 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014378 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014379 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014380 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014381 goto onError;
14382 }
14383 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014384 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014385 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014386 /* make sure number is a type of integer */
14387 if (!PyLong_Check(v)) {
14388 iobj = PyNumber_Index(v);
14389 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014390 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014391 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014392 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014393 Py_DECREF(iobj);
14394 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014395 else {
14396 x = PyLong_AsLong(v);
14397 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014398 if (x == -1 && PyErr_Occurred())
14399 goto onError;
14400
Victor Stinner8faf8212011-12-08 22:14:11 +010014401 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014402 PyErr_SetString(PyExc_OverflowError,
14403 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014404 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014405 }
14406
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014407 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014408 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014409
Benjamin Peterson29060642009-01-31 22:14:21 +000014410 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014411 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014412 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014413 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014414}
14415
Victor Stinnera47082312012-10-04 02:19:54 +020014416/* Parse options of an argument: flags, width, precision.
14417 Handle also "%(name)" syntax.
14418
14419 Return 0 if the argument has been formatted into arg->str.
14420 Return 1 if the argument has been written into ctx->writer,
14421 Raise an exception and return -1 on error. */
14422static int
14423unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14424 struct unicode_format_arg_t *arg)
14425{
14426#define FORMAT_READ(ctx) \
14427 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14428
14429 PyObject *v;
14430
Victor Stinnera47082312012-10-04 02:19:54 +020014431 if (arg->ch == '(') {
14432 /* Get argument value from a dictionary. Example: "%(name)s". */
14433 Py_ssize_t keystart;
14434 Py_ssize_t keylen;
14435 PyObject *key;
14436 int pcount = 1;
14437
14438 if (ctx->dict == NULL) {
14439 PyErr_SetString(PyExc_TypeError,
14440 "format requires a mapping");
14441 return -1;
14442 }
14443 ++ctx->fmtpos;
14444 --ctx->fmtcnt;
14445 keystart = ctx->fmtpos;
14446 /* Skip over balanced parentheses */
14447 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14448 arg->ch = FORMAT_READ(ctx);
14449 if (arg->ch == ')')
14450 --pcount;
14451 else if (arg->ch == '(')
14452 ++pcount;
14453 ctx->fmtpos++;
14454 }
14455 keylen = ctx->fmtpos - keystart - 1;
14456 if (ctx->fmtcnt < 0 || pcount > 0) {
14457 PyErr_SetString(PyExc_ValueError,
14458 "incomplete format key");
14459 return -1;
14460 }
14461 key = PyUnicode_Substring(ctx->fmtstr,
14462 keystart, keystart + keylen);
14463 if (key == NULL)
14464 return -1;
14465 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014466 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014467 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014468 }
14469 ctx->args = PyObject_GetItem(ctx->dict, key);
14470 Py_DECREF(key);
14471 if (ctx->args == NULL)
14472 return -1;
14473 ctx->args_owned = 1;
14474 ctx->arglen = -1;
14475 ctx->argidx = -2;
14476 }
14477
14478 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014479 while (--ctx->fmtcnt >= 0) {
14480 arg->ch = FORMAT_READ(ctx);
14481 ctx->fmtpos++;
14482 switch (arg->ch) {
14483 case '-': arg->flags |= F_LJUST; continue;
14484 case '+': arg->flags |= F_SIGN; continue;
14485 case ' ': arg->flags |= F_BLANK; continue;
14486 case '#': arg->flags |= F_ALT; continue;
14487 case '0': arg->flags |= F_ZERO; continue;
14488 }
14489 break;
14490 }
14491
14492 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014493 if (arg->ch == '*') {
14494 v = unicode_format_getnextarg(ctx);
14495 if (v == NULL)
14496 return -1;
14497 if (!PyLong_Check(v)) {
14498 PyErr_SetString(PyExc_TypeError,
14499 "* wants int");
14500 return -1;
14501 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014502 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014503 if (arg->width == -1 && PyErr_Occurred())
14504 return -1;
14505 if (arg->width < 0) {
14506 arg->flags |= F_LJUST;
14507 arg->width = -arg->width;
14508 }
14509 if (--ctx->fmtcnt >= 0) {
14510 arg->ch = FORMAT_READ(ctx);
14511 ctx->fmtpos++;
14512 }
14513 }
14514 else if (arg->ch >= '0' && arg->ch <= '9') {
14515 arg->width = arg->ch - '0';
14516 while (--ctx->fmtcnt >= 0) {
14517 arg->ch = FORMAT_READ(ctx);
14518 ctx->fmtpos++;
14519 if (arg->ch < '0' || arg->ch > '9')
14520 break;
14521 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14522 mixing signed and unsigned comparison. Since arg->ch is between
14523 '0' and '9', casting to int is safe. */
14524 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14525 PyErr_SetString(PyExc_ValueError,
14526 "width too big");
14527 return -1;
14528 }
14529 arg->width = arg->width*10 + (arg->ch - '0');
14530 }
14531 }
14532
14533 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014534 if (arg->ch == '.') {
14535 arg->prec = 0;
14536 if (--ctx->fmtcnt >= 0) {
14537 arg->ch = FORMAT_READ(ctx);
14538 ctx->fmtpos++;
14539 }
14540 if (arg->ch == '*') {
14541 v = unicode_format_getnextarg(ctx);
14542 if (v == NULL)
14543 return -1;
14544 if (!PyLong_Check(v)) {
14545 PyErr_SetString(PyExc_TypeError,
14546 "* wants int");
14547 return -1;
14548 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014549 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014550 if (arg->prec == -1 && PyErr_Occurred())
14551 return -1;
14552 if (arg->prec < 0)
14553 arg->prec = 0;
14554 if (--ctx->fmtcnt >= 0) {
14555 arg->ch = FORMAT_READ(ctx);
14556 ctx->fmtpos++;
14557 }
14558 }
14559 else if (arg->ch >= '0' && arg->ch <= '9') {
14560 arg->prec = arg->ch - '0';
14561 while (--ctx->fmtcnt >= 0) {
14562 arg->ch = FORMAT_READ(ctx);
14563 ctx->fmtpos++;
14564 if (arg->ch < '0' || arg->ch > '9')
14565 break;
14566 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14567 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014568 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014569 return -1;
14570 }
14571 arg->prec = arg->prec*10 + (arg->ch - '0');
14572 }
14573 }
14574 }
14575
14576 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14577 if (ctx->fmtcnt >= 0) {
14578 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14579 if (--ctx->fmtcnt >= 0) {
14580 arg->ch = FORMAT_READ(ctx);
14581 ctx->fmtpos++;
14582 }
14583 }
14584 }
14585 if (ctx->fmtcnt < 0) {
14586 PyErr_SetString(PyExc_ValueError,
14587 "incomplete format");
14588 return -1;
14589 }
14590 return 0;
14591
14592#undef FORMAT_READ
14593}
14594
14595/* Format one argument. Supported conversion specifiers:
14596
14597 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014598 - "i", "d", "u": int or float
14599 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014600 - "e", "E", "f", "F", "g", "G": float
14601 - "c": int or str (1 character)
14602
Victor Stinner8dbd4212012-12-04 09:30:24 +010014603 When possible, the output is written directly into the Unicode writer
14604 (ctx->writer). A string is created when padding is required.
14605
Victor Stinnera47082312012-10-04 02:19:54 +020014606 Return 0 if the argument has been formatted into *p_str,
14607 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014608 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014609static int
14610unicode_format_arg_format(struct unicode_formatter_t *ctx,
14611 struct unicode_format_arg_t *arg,
14612 PyObject **p_str)
14613{
14614 PyObject *v;
14615 _PyUnicodeWriter *writer = &ctx->writer;
14616
14617 if (ctx->fmtcnt == 0)
14618 ctx->writer.overallocate = 0;
14619
Victor Stinnera47082312012-10-04 02:19:54 +020014620 v = unicode_format_getnextarg(ctx);
14621 if (v == NULL)
14622 return -1;
14623
Victor Stinnera47082312012-10-04 02:19:54 +020014624
14625 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014626 case 's':
14627 case 'r':
14628 case 'a':
14629 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14630 /* Fast path */
14631 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14632 return -1;
14633 return 1;
14634 }
14635
14636 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14637 *p_str = v;
14638 Py_INCREF(*p_str);
14639 }
14640 else {
14641 if (arg->ch == 's')
14642 *p_str = PyObject_Str(v);
14643 else if (arg->ch == 'r')
14644 *p_str = PyObject_Repr(v);
14645 else
14646 *p_str = PyObject_ASCII(v);
14647 }
14648 break;
14649
14650 case 'i':
14651 case 'd':
14652 case 'u':
14653 case 'o':
14654 case 'x':
14655 case 'X':
14656 {
14657 int ret = mainformatlong(v, arg, p_str, writer);
14658 if (ret != 0)
14659 return ret;
14660 arg->sign = 1;
14661 break;
14662 }
14663
14664 case 'e':
14665 case 'E':
14666 case 'f':
14667 case 'F':
14668 case 'g':
14669 case 'G':
14670 if (arg->width == -1 && arg->prec == -1
14671 && !(arg->flags & (F_SIGN | F_BLANK)))
14672 {
14673 /* Fast path */
14674 if (formatfloat(v, arg, NULL, writer) == -1)
14675 return -1;
14676 return 1;
14677 }
14678
14679 arg->sign = 1;
14680 if (formatfloat(v, arg, p_str, NULL) == -1)
14681 return -1;
14682 break;
14683
14684 case 'c':
14685 {
14686 Py_UCS4 ch = formatchar(v);
14687 if (ch == (Py_UCS4) -1)
14688 return -1;
14689 if (arg->width == -1 && arg->prec == -1) {
14690 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014691 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014692 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014693 return 1;
14694 }
14695 *p_str = PyUnicode_FromOrdinal(ch);
14696 break;
14697 }
14698
14699 default:
14700 PyErr_Format(PyExc_ValueError,
14701 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014702 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014703 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14704 (int)arg->ch,
14705 ctx->fmtpos - 1);
14706 return -1;
14707 }
14708 if (*p_str == NULL)
14709 return -1;
14710 assert (PyUnicode_Check(*p_str));
14711 return 0;
14712}
14713
14714static int
14715unicode_format_arg_output(struct unicode_formatter_t *ctx,
14716 struct unicode_format_arg_t *arg,
14717 PyObject *str)
14718{
14719 Py_ssize_t len;
14720 enum PyUnicode_Kind kind;
14721 void *pbuf;
14722 Py_ssize_t pindex;
14723 Py_UCS4 signchar;
14724 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014725 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014726 Py_ssize_t sublen;
14727 _PyUnicodeWriter *writer = &ctx->writer;
14728 Py_UCS4 fill;
14729
14730 fill = ' ';
14731 if (arg->sign && arg->flags & F_ZERO)
14732 fill = '0';
14733
14734 if (PyUnicode_READY(str) == -1)
14735 return -1;
14736
14737 len = PyUnicode_GET_LENGTH(str);
14738 if ((arg->width == -1 || arg->width <= len)
14739 && (arg->prec == -1 || arg->prec >= len)
14740 && !(arg->flags & (F_SIGN | F_BLANK)))
14741 {
14742 /* Fast path */
14743 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14744 return -1;
14745 return 0;
14746 }
14747
14748 /* Truncate the string for "s", "r" and "a" formats
14749 if the precision is set */
14750 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14751 if (arg->prec >= 0 && len > arg->prec)
14752 len = arg->prec;
14753 }
14754
14755 /* Adjust sign and width */
14756 kind = PyUnicode_KIND(str);
14757 pbuf = PyUnicode_DATA(str);
14758 pindex = 0;
14759 signchar = '\0';
14760 if (arg->sign) {
14761 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14762 if (ch == '-' || ch == '+') {
14763 signchar = ch;
14764 len--;
14765 pindex++;
14766 }
14767 else if (arg->flags & F_SIGN)
14768 signchar = '+';
14769 else if (arg->flags & F_BLANK)
14770 signchar = ' ';
14771 else
14772 arg->sign = 0;
14773 }
14774 if (arg->width < len)
14775 arg->width = len;
14776
14777 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014778 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014779 if (!(arg->flags & F_LJUST)) {
14780 if (arg->sign) {
14781 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014782 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014783 }
14784 else {
14785 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014786 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014787 }
14788 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014789 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14790 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014791 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014792 }
14793
Victor Stinnera47082312012-10-04 02:19:54 +020014794 buflen = arg->width;
14795 if (arg->sign && len == arg->width)
14796 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014797 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014798 return -1;
14799
14800 /* Write the sign if needed */
14801 if (arg->sign) {
14802 if (fill != ' ') {
14803 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14804 writer->pos += 1;
14805 }
14806 if (arg->width > len)
14807 arg->width--;
14808 }
14809
14810 /* Write the numeric prefix for "x", "X" and "o" formats
14811 if the alternate form is used.
14812 For example, write "0x" for the "%#x" format. */
14813 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14814 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14815 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14816 if (fill != ' ') {
14817 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14818 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14819 writer->pos += 2;
14820 pindex += 2;
14821 }
14822 arg->width -= 2;
14823 if (arg->width < 0)
14824 arg->width = 0;
14825 len -= 2;
14826 }
14827
14828 /* Pad left with the fill character if needed */
14829 if (arg->width > len && !(arg->flags & F_LJUST)) {
14830 sublen = arg->width - len;
14831 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14832 writer->pos += sublen;
14833 arg->width = len;
14834 }
14835
14836 /* If padding with spaces: write sign if needed and/or numeric prefix if
14837 the alternate form is used */
14838 if (fill == ' ') {
14839 if (arg->sign) {
14840 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14841 writer->pos += 1;
14842 }
14843 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14844 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14845 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14846 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14847 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14848 writer->pos += 2;
14849 pindex += 2;
14850 }
14851 }
14852
14853 /* Write characters */
14854 if (len) {
14855 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14856 str, pindex, len);
14857 writer->pos += len;
14858 }
14859
14860 /* Pad right with the fill character if needed */
14861 if (arg->width > len) {
14862 sublen = arg->width - len;
14863 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14864 writer->pos += sublen;
14865 }
14866 return 0;
14867}
14868
14869/* Helper of PyUnicode_Format(): format one arg.
14870 Return 0 on success, raise an exception and return -1 on error. */
14871static int
14872unicode_format_arg(struct unicode_formatter_t *ctx)
14873{
14874 struct unicode_format_arg_t arg;
14875 PyObject *str;
14876 int ret;
14877
Victor Stinner8dbd4212012-12-04 09:30:24 +010014878 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014879 if (arg.ch == '%') {
14880 ctx->fmtpos++;
14881 ctx->fmtcnt--;
14882 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14883 return -1;
14884 return 0;
14885 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014886 arg.flags = 0;
14887 arg.width = -1;
14888 arg.prec = -1;
14889 arg.sign = 0;
14890 str = NULL;
14891
Victor Stinnera47082312012-10-04 02:19:54 +020014892 ret = unicode_format_arg_parse(ctx, &arg);
14893 if (ret == -1)
14894 return -1;
14895
14896 ret = unicode_format_arg_format(ctx, &arg, &str);
14897 if (ret == -1)
14898 return -1;
14899
14900 if (ret != 1) {
14901 ret = unicode_format_arg_output(ctx, &arg, str);
14902 Py_DECREF(str);
14903 if (ret == -1)
14904 return -1;
14905 }
14906
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014907 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014908 PyErr_SetString(PyExc_TypeError,
14909 "not all arguments converted during string formatting");
14910 return -1;
14911 }
14912 return 0;
14913}
14914
Alexander Belopolsky40018472011-02-26 01:02:56 +000014915PyObject *
14916PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014917{
Victor Stinnera47082312012-10-04 02:19:54 +020014918 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014919
Guido van Rossumd57fd912000-03-10 22:53:23 +000014920 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014921 PyErr_BadInternalCall();
14922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014923 }
Victor Stinnera47082312012-10-04 02:19:54 +020014924
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014925 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014926 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014927
14928 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014929 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14930 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14931 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14932 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014933
Victor Stinner8f674cc2013-04-17 23:02:17 +020014934 _PyUnicodeWriter_Init(&ctx.writer);
14935 ctx.writer.min_length = ctx.fmtcnt + 100;
14936 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014937
Guido van Rossumd57fd912000-03-10 22:53:23 +000014938 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014939 ctx.arglen = PyTuple_Size(args);
14940 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014941 }
14942 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014943 ctx.arglen = -1;
14944 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014945 }
Victor Stinnera47082312012-10-04 02:19:54 +020014946 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014947 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014948 ctx.dict = args;
14949 else
14950 ctx.dict = NULL;
14951 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014952
Victor Stinnera47082312012-10-04 02:19:54 +020014953 while (--ctx.fmtcnt >= 0) {
14954 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014955 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014956
14957 nonfmtpos = ctx.fmtpos++;
14958 while (ctx.fmtcnt >= 0 &&
14959 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14960 ctx.fmtpos++;
14961 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014962 }
Victor Stinnera47082312012-10-04 02:19:54 +020014963 if (ctx.fmtcnt < 0) {
14964 ctx.fmtpos--;
14965 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014966 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014967
Victor Stinnercfc4c132013-04-03 01:48:39 +020014968 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14969 nonfmtpos, ctx.fmtpos) < 0)
14970 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014971 }
14972 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014973 ctx.fmtpos++;
14974 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014975 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014976 }
14977 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014978
Victor Stinnera47082312012-10-04 02:19:54 +020014979 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014980 PyErr_SetString(PyExc_TypeError,
14981 "not all arguments converted during string formatting");
14982 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014983 }
14984
Victor Stinnera47082312012-10-04 02:19:54 +020014985 if (ctx.args_owned) {
14986 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014987 }
Victor Stinnera47082312012-10-04 02:19:54 +020014988 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014989
Benjamin Peterson29060642009-01-31 22:14:21 +000014990 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014991 _PyUnicodeWriter_Dealloc(&ctx.writer);
14992 if (ctx.args_owned) {
14993 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014994 }
14995 return NULL;
14996}
14997
Jeremy Hylton938ace62002-07-17 16:30:39 +000014998static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014999unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15000
Tim Peters6d6c1a32001-08-02 04:15:00 +000015001static PyObject *
15002unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15003{
Benjamin Peterson29060642009-01-31 22:14:21 +000015004 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015005 static char *kwlist[] = {"object", "encoding", "errors", 0};
15006 char *encoding = NULL;
15007 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015008
Benjamin Peterson14339b62009-01-31 16:36:08 +000015009 if (type != &PyUnicode_Type)
15010 return unicode_subtype_new(type, args, kwds);
15011 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015012 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015013 return NULL;
15014 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015015 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015016 if (encoding == NULL && errors == NULL)
15017 return PyObject_Str(x);
15018 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015019 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015020}
15021
Guido van Rossume023fe02001-08-30 03:12:59 +000015022static PyObject *
15023unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15024{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015025 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015026 Py_ssize_t length, char_size;
15027 int share_wstr, share_utf8;
15028 unsigned int kind;
15029 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015030
Benjamin Peterson14339b62009-01-31 16:36:08 +000015031 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015032
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015033 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015034 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015035 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015036 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015037 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015038 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015039 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015040 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015041
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015042 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015043 if (self == NULL) {
15044 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015045 return NULL;
15046 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015047 kind = PyUnicode_KIND(unicode);
15048 length = PyUnicode_GET_LENGTH(unicode);
15049
15050 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015051#ifdef Py_DEBUG
15052 _PyUnicode_HASH(self) = -1;
15053#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015054 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015055#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015056 _PyUnicode_STATE(self).interned = 0;
15057 _PyUnicode_STATE(self).kind = kind;
15058 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015059 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015060 _PyUnicode_STATE(self).ready = 1;
15061 _PyUnicode_WSTR(self) = NULL;
15062 _PyUnicode_UTF8_LENGTH(self) = 0;
15063 _PyUnicode_UTF8(self) = NULL;
15064 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015065 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015066
15067 share_utf8 = 0;
15068 share_wstr = 0;
15069 if (kind == PyUnicode_1BYTE_KIND) {
15070 char_size = 1;
15071 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15072 share_utf8 = 1;
15073 }
15074 else if (kind == PyUnicode_2BYTE_KIND) {
15075 char_size = 2;
15076 if (sizeof(wchar_t) == 2)
15077 share_wstr = 1;
15078 }
15079 else {
15080 assert(kind == PyUnicode_4BYTE_KIND);
15081 char_size = 4;
15082 if (sizeof(wchar_t) == 4)
15083 share_wstr = 1;
15084 }
15085
15086 /* Ensure we won't overflow the length. */
15087 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15088 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015089 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015090 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015091 data = PyObject_MALLOC((length + 1) * char_size);
15092 if (data == NULL) {
15093 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015094 goto onError;
15095 }
15096
Victor Stinnerc3c74152011-10-02 20:39:55 +020015097 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015098 if (share_utf8) {
15099 _PyUnicode_UTF8_LENGTH(self) = length;
15100 _PyUnicode_UTF8(self) = data;
15101 }
15102 if (share_wstr) {
15103 _PyUnicode_WSTR_LENGTH(self) = length;
15104 _PyUnicode_WSTR(self) = (wchar_t *)data;
15105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015106
Christian Heimesf051e432016-09-13 20:22:02 +020015107 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015108 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015109 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015110#ifdef Py_DEBUG
15111 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15112#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015113 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015114 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015115
15116onError:
15117 Py_DECREF(unicode);
15118 Py_DECREF(self);
15119 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015120}
15121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015122PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015123"str(object='') -> str\n\
15124str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015125\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015126Create a new string object from the given object. If encoding or\n\
15127errors is specified, then the object must expose a data buffer\n\
15128that will be decoded using the given encoding and error handler.\n\
15129Otherwise, returns the result of object.__str__() (if defined)\n\
15130or repr(object).\n\
15131encoding defaults to sys.getdefaultencoding().\n\
15132errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015133
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015134static PyObject *unicode_iter(PyObject *seq);
15135
Guido van Rossumd57fd912000-03-10 22:53:23 +000015136PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015137 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015138 "str", /* tp_name */
15139 sizeof(PyUnicodeObject), /* tp_size */
15140 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015141 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 (destructor)unicode_dealloc, /* tp_dealloc */
15143 0, /* tp_print */
15144 0, /* tp_getattr */
15145 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015146 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015147 unicode_repr, /* tp_repr */
15148 &unicode_as_number, /* tp_as_number */
15149 &unicode_as_sequence, /* tp_as_sequence */
15150 &unicode_as_mapping, /* tp_as_mapping */
15151 (hashfunc) unicode_hash, /* tp_hash*/
15152 0, /* tp_call*/
15153 (reprfunc) unicode_str, /* tp_str */
15154 PyObject_GenericGetAttr, /* tp_getattro */
15155 0, /* tp_setattro */
15156 0, /* tp_as_buffer */
15157 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015158 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 unicode_doc, /* tp_doc */
15160 0, /* tp_traverse */
15161 0, /* tp_clear */
15162 PyUnicode_RichCompare, /* tp_richcompare */
15163 0, /* tp_weaklistoffset */
15164 unicode_iter, /* tp_iter */
15165 0, /* tp_iternext */
15166 unicode_methods, /* tp_methods */
15167 0, /* tp_members */
15168 0, /* tp_getset */
15169 &PyBaseObject_Type, /* tp_base */
15170 0, /* tp_dict */
15171 0, /* tp_descr_get */
15172 0, /* tp_descr_set */
15173 0, /* tp_dictoffset */
15174 0, /* tp_init */
15175 0, /* tp_alloc */
15176 unicode_new, /* tp_new */
15177 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015178};
15179
15180/* Initialize the Unicode implementation */
15181
Victor Stinner3a50e702011-10-18 21:21:00 +020015182int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015183{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015184 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015185 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015186 0x000A, /* LINE FEED */
15187 0x000D, /* CARRIAGE RETURN */
15188 0x001C, /* FILE SEPARATOR */
15189 0x001D, /* GROUP SEPARATOR */
15190 0x001E, /* RECORD SEPARATOR */
15191 0x0085, /* NEXT LINE */
15192 0x2028, /* LINE SEPARATOR */
15193 0x2029, /* PARAGRAPH SEPARATOR */
15194 };
15195
Fred Drakee4315f52000-05-09 19:53:39 +000015196 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015197 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015198 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015199 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015200 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015201
Guido van Rossumcacfc072002-05-24 19:01:59 +000015202 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015203 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015204
15205 /* initialize the linebreak bloom filter */
15206 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015207 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015208 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015209
Christian Heimes26532f72013-07-20 14:57:16 +020015210 if (PyType_Ready(&EncodingMapType) < 0)
15211 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015212
Benjamin Petersonc4311282012-10-30 23:21:10 -040015213 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15214 Py_FatalError("Can't initialize field name iterator type");
15215
15216 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15217 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015218
Victor Stinner3a50e702011-10-18 21:21:00 +020015219 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015220}
15221
15222/* Finalize the Unicode implementation */
15223
/* Nothing is released by this function: it always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15229
Guido van Rossumd57fd912000-03-10 22:53:23 +000015230void
Thomas Wouters78890102000-07-22 19:25:51 +000015231_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015232{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015233 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015234
Serhiy Storchaka05997252013-01-26 12:14:02 +020015235 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015236
Serhiy Storchaka05997252013-01-26 12:14:02 +020015237 for (i = 0; i < 256; i++)
15238 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015239 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015240 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015241}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015242
Walter Dörwald16807132007-05-25 13:52:07 +000015243void
15244PyUnicode_InternInPlace(PyObject **p)
15245{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015246 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015247 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015248#ifdef Py_DEBUG
15249 assert(s != NULL);
15250 assert(_PyUnicode_CHECK(s));
15251#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015252 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015253 return;
15254#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015255 /* If it's a subclass, we don't really know what putting
15256 it in the interned dict might do. */
15257 if (!PyUnicode_CheckExact(s))
15258 return;
15259 if (PyUnicode_CHECK_INTERNED(s))
15260 return;
15261 if (interned == NULL) {
15262 interned = PyDict_New();
15263 if (interned == NULL) {
15264 PyErr_Clear(); /* Don't leave an exception */
15265 return;
15266 }
15267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015268 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015269 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015271 if (t == NULL) {
15272 PyErr_Clear();
15273 return;
15274 }
15275 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015276 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015277 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015278 return;
15279 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015280 /* The two references in interned are not counted by refcnt.
15281 The deallocator will take care of this */
15282 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015283 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015284}
15285
15286void
15287PyUnicode_InternImmortal(PyObject **p)
15288{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 PyUnicode_InternInPlace(p);
15290 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015291 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 Py_INCREF(*p);
15293 }
Walter Dörwald16807132007-05-25 13:52:07 +000015294}
15295
15296PyObject *
15297PyUnicode_InternFromString(const char *cp)
15298{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015299 PyObject *s = PyUnicode_FromString(cp);
15300 if (s == NULL)
15301 return NULL;
15302 PyUnicode_InternInPlace(&s);
15303 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015304}
15305
Alexander Belopolsky40018472011-02-26 01:02:56 +000015306void
15307_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015308{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015309 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015310 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015311 Py_ssize_t i, n;
15312 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015313
Benjamin Peterson14339b62009-01-31 16:36:08 +000015314 if (interned == NULL || !PyDict_Check(interned))
15315 return;
15316 keys = PyDict_Keys(interned);
15317 if (keys == NULL || !PyList_Check(keys)) {
15318 PyErr_Clear();
15319 return;
15320 }
Walter Dörwald16807132007-05-25 13:52:07 +000015321
Benjamin Peterson14339b62009-01-31 16:36:08 +000015322 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15323 detector, interned unicode strings are not forcibly deallocated;
15324 rather, we give them their stolen references back, and then clear
15325 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015326
Benjamin Peterson14339b62009-01-31 16:36:08 +000015327 n = PyList_GET_SIZE(keys);
15328 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015329 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015330 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015331 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015332 if (PyUnicode_READY(s) == -1) {
15333 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015334 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015335 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015336 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 case SSTATE_NOT_INTERNED:
15338 /* XXX Shouldn't happen */
15339 break;
15340 case SSTATE_INTERNED_IMMORTAL:
15341 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015342 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 break;
15344 case SSTATE_INTERNED_MORTAL:
15345 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015346 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 break;
15348 default:
15349 Py_FatalError("Inconsistent interned string state.");
15350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015351 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 }
15353 fprintf(stderr, "total size of all interned strings: "
15354 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15355 "mortal/immortal\n", mortal_size, immortal_size);
15356 Py_DECREF(keys);
15357 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015358 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015359}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015360
15361
15362/********************* Unicode Iterator **************************/
15363
15364typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 PyObject_HEAD
15366 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015367 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015368} unicodeiterobject;
15369
15370static void
15371unicodeiter_dealloc(unicodeiterobject *it)
15372{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 _PyObject_GC_UNTRACK(it);
15374 Py_XDECREF(it->it_seq);
15375 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015376}
15377
15378static int
15379unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15380{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015381 Py_VISIT(it->it_seq);
15382 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015383}
15384
15385static PyObject *
15386unicodeiter_next(unicodeiterobject *it)
15387{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015388 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015389
Benjamin Peterson14339b62009-01-31 16:36:08 +000015390 assert(it != NULL);
15391 seq = it->it_seq;
15392 if (seq == NULL)
15393 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015394 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015396 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15397 int kind = PyUnicode_KIND(seq);
15398 void *data = PyUnicode_DATA(seq);
15399 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15400 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015401 if (item != NULL)
15402 ++it->it_index;
15403 return item;
15404 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015405
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015407 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015408 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015409}
15410
15411static PyObject *
15412unicodeiter_len(unicodeiterobject *it)
15413{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015414 Py_ssize_t len = 0;
15415 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015416 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015418}
15419
15420PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15421
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015422static PyObject *
15423unicodeiter_reduce(unicodeiterobject *it)
15424{
15425 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015426 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015427 it->it_seq, it->it_index);
15428 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015429 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015430 if (u == NULL)
15431 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015432 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015433 }
15434}
15435
15436PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15437
15438static PyObject *
15439unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15440{
15441 Py_ssize_t index = PyLong_AsSsize_t(state);
15442 if (index == -1 && PyErr_Occurred())
15443 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015444 if (it->it_seq != NULL) {
15445 if (index < 0)
15446 index = 0;
15447 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15448 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15449 it->it_index = index;
15450 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015451 Py_RETURN_NONE;
15452}
15453
15454PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15455
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015456static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015457 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015458 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015459 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15460 reduce_doc},
15461 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15462 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015463 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015464};
15465
15466PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015467 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15468 "str_iterator", /* tp_name */
15469 sizeof(unicodeiterobject), /* tp_basicsize */
15470 0, /* tp_itemsize */
15471 /* methods */
15472 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15473 0, /* tp_print */
15474 0, /* tp_getattr */
15475 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015476 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015477 0, /* tp_repr */
15478 0, /* tp_as_number */
15479 0, /* tp_as_sequence */
15480 0, /* tp_as_mapping */
15481 0, /* tp_hash */
15482 0, /* tp_call */
15483 0, /* tp_str */
15484 PyObject_GenericGetAttr, /* tp_getattro */
15485 0, /* tp_setattro */
15486 0, /* tp_as_buffer */
15487 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15488 0, /* tp_doc */
15489 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15490 0, /* tp_clear */
15491 0, /* tp_richcompare */
15492 0, /* tp_weaklistoffset */
15493 PyObject_SelfIter, /* tp_iter */
15494 (iternextfunc)unicodeiter_next, /* tp_iternext */
15495 unicodeiter_methods, /* tp_methods */
15496 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015497};
15498
15499static PyObject *
15500unicode_iter(PyObject *seq)
15501{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015502 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015503
Benjamin Peterson14339b62009-01-31 16:36:08 +000015504 if (!PyUnicode_Check(seq)) {
15505 PyErr_BadInternalCall();
15506 return NULL;
15507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015508 if (PyUnicode_READY(seq) == -1)
15509 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015510 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15511 if (it == NULL)
15512 return NULL;
15513 it->it_index = 0;
15514 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015515 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015516 _PyObject_GC_TRACK(it);
15517 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015518}
15519
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015520
15521size_t
15522Py_UNICODE_strlen(const Py_UNICODE *u)
15523{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015524 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015525}
15526
15527Py_UNICODE*
15528Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15529{
15530 Py_UNICODE *u = s1;
15531 while ((*u++ = *s2++));
15532 return s1;
15533}
15534
15535Py_UNICODE*
15536Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15537{
15538 Py_UNICODE *u = s1;
15539 while ((*u++ = *s2++))
15540 if (n-- == 0)
15541 break;
15542 return s1;
15543}
15544
15545Py_UNICODE*
15546Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15547{
15548 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015549 u1 += wcslen(u1);
15550 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015551 return s1;
15552}
15553
15554int
15555Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15556{
15557 while (*s1 && *s2 && *s1 == *s2)
15558 s1++, s2++;
15559 if (*s1 && *s2)
15560 return (*s1 < *s2) ? -1 : +1;
15561 if (*s1)
15562 return 1;
15563 if (*s2)
15564 return -1;
15565 return 0;
15566}
15567
15568int
15569Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15570{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015571 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015572 for (; n != 0; n--) {
15573 u1 = *s1;
15574 u2 = *s2;
15575 if (u1 != u2)
15576 return (u1 < u2) ? -1 : +1;
15577 if (u1 == '\0')
15578 return 0;
15579 s1++;
15580 s2++;
15581 }
15582 return 0;
15583}
15584
15585Py_UNICODE*
15586Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15587{
15588 const Py_UNICODE *p;
15589 for (p = s; *p; p++)
15590 if (*p == c)
15591 return (Py_UNICODE*)p;
15592 return NULL;
15593}
15594
15595Py_UNICODE*
15596Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15597{
15598 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015599 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015600 while (p != s) {
15601 p--;
15602 if (*p == c)
15603 return (Py_UNICODE*)p;
15604 }
15605 return NULL;
15606}
Victor Stinner331ea922010-08-10 16:37:20 +000015607
Victor Stinner71133ff2010-09-01 23:43:53 +000015608Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015609PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015610{
Victor Stinner577db2c2011-10-11 22:12:48 +020015611 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015612 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015614 if (!PyUnicode_Check(unicode)) {
15615 PyErr_BadArgument();
15616 return NULL;
15617 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015618 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015619 if (u == NULL)
15620 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015621 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015622 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015623 PyErr_NoMemory();
15624 return NULL;
15625 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015626 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015627 size *= sizeof(Py_UNICODE);
15628 copy = PyMem_Malloc(size);
15629 if (copy == NULL) {
15630 PyErr_NoMemory();
15631 return NULL;
15632 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015633 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015634 return copy;
15635}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015636
Georg Brandl66c221e2010-10-14 07:04:07 +000015637/* A _string module, to export formatter_parser and formatter_field_name_split
15638 to the string.Formatter class implemented in Python. */
15639
15640static PyMethodDef _string_methods[] = {
15641 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15642 METH_O, PyDoc_STR("split the argument as a field name")},
15643 {"formatter_parser", (PyCFunction) formatter_parser,
15644 METH_O, PyDoc_STR("parse the argument as a format string")},
15645 {NULL, NULL}
15646};
15647
15648static struct PyModuleDef _string_module = {
15649 PyModuleDef_HEAD_INIT,
15650 "_string",
15651 PyDoc_STR("string helper module"),
15652 0,
15653 _string_methods,
15654 NULL,
15655 NULL,
15656 NULL,
15657 NULL
15658};
15659
15660PyMODINIT_FUNC
15661PyInit__string(void)
15662{
15663 return PyModule_Create(&_string_module);
15664}
15665
15666
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015667#ifdef __cplusplus
15668}
15669#endif