/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080069
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000078
#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Object check used by the assertions in this file.  Debug builds run
   _PyUnicode_CheckConsistency() with check_content=0 (the characters are
   not scanned); other builds only perform the PyUnicode_Check() type
   test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020091
/* Raw access to the utf8 field of the compact object header.  No type or
   readiness check is performed. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string.  For a compact ASCII object this is the
   memory that immediately follows the PyASCIIObject header; otherwise it
   is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field of the compact object header. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII
   object this is the length field of the PyASCIIObject header; otherwise
   it is the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw field accessors: they cast and read the field without any type or
   readiness check. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked accessors: assert _PyUnicode_CHECK(op) before reading. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of the PyUnicodeObject layout. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* File-local replacement for PyUnicode_READY(): asserts
   _PyUnicode_CHECK(op), evaluates to 0 when the string is already ready
   and otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200133
/* True if the utf8 pointer of op is the same address as its data buffer,
   i.e. no separate UTF-8 block exists.  Asserts that op is not a compact
   ASCII object. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same address as its data buffer.
   The object is referenced only through the macro parameter, so the macro
   no longer depends on the caller having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
141
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its
   utf8 pointer is non-NULL and differs from the data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the string is not ready or wstr differs from the data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
155
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   begin, end and to are each evaluated exactly once.  The main loop
   copies four elements per iteration up to the rounded-down boundary;
   the remaining tail is copied one element at a time.  The source
   pointers are cast to "const from_type *" so that a const-qualified
   source does not lose its qualifier in the cast. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200179
/* Divisor applied when overallocating: a factor of 2 corresponds to the
   50% mentioned below and a factor of 4 to the 25%. */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
187
Walter Dörwald16807132007-05-25 13:52:07 +0000188/* This dictionary holds all interned unicode strings. Note that references
189 to strings in this dictionary are *not* counted in the string's ob_refcnt.
190 When the interned string reaches a refcnt of 0 the string deallocation
191 function will delete the reference from this dictionary.
192
193 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000194 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000195*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200196static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200199static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200200
/* Take a reference to the shared empty string, creating the object with
   PyUnicode_New(0, 0) on first use.  If the creation fails, unicode_empty
   stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return the shared empty string from the enclosing function after
   _Py_INCREF_UNICODE_EMPTY().  The returned pointer is NULL when the
   empty string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200220/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700221static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200222_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200224/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000227/* Single character Unicode strings in the Latin-1 range are being
228 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200229static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by a character code in the ASCII range:
   an entry is 1 for the characters named in the comments below and 0 for
   everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
261
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200262/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200263static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200264static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100265static int unicode_modifiable(PyObject *unicode);
266
Victor Stinnerfe226c02011-10-03 03:52:20 +0200267
Alexander Belopolsky40018472011-02-26 01:02:56 +0000268static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100269_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200270static PyObject *
271_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272static PyObject *
273_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274
275static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000276unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000277 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100278 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000279 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280
Alexander Belopolsky40018472011-02-26 01:02:56 +0000281static void
282raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300283 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100284 PyObject *unicode,
285 Py_ssize_t startpos, Py_ssize_t endpos,
286 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000287
/* Fast detection of the line break characters in the ASCII range.
   128-entry lookup table indexed by a character code: an entry is 1 for
   the characters named in the comments below and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
315
INADA Naoki3ae20562017-01-16 20:41:20 +0900316static int convert_uc(PyObject *obj, void *addr);
317
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300318#include "clinic/unicodeobject.c.h"
319
/* Codec error handlers recognised by get_error_handler().
   _Py_ERROR_UNKNOWN is 0; _Py_ERROR_OTHER stands for any handler name
   that is not one of the names tested in get_error_handler(). */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL and for "strict", the matching
   enumerator for the other names tested below, and _Py_ERROR_OTHER for
   any other string.  The comparison is exact (strcmp). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        return _Py_ERROR_STRICT;
    }
    if (strcmp(errors, "surrogateescape") == 0) {
        return _Py_ERROR_SURROGATEESCAPE;
    }
    if (strcmp(errors, "replace") == 0) {
        return _Py_ERROR_REPLACE;
    }
    if (strcmp(errors, "ignore") == 0) {
        return _Py_ERROR_IGNORE;
    }
    if (strcmp(errors, "backslashreplace") == 0) {
        return _Py_ERROR_BACKSLASHREPLACE;
    }
    if (strcmp(errors, "surrogatepass") == 0) {
        return _Py_ERROR_SURROGATEPASS;
    }
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
        return _Py_ERROR_XMLCHARREFREPLACE;
    }
    return _Py_ERROR_OTHER;
}
358
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300359/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000362PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000363{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000364#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000365 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000366#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000367 /* This is actually an illegal character, so it should
368 not be passed to unichr. */
369 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000370#endif
371}
372
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a Unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: if non-zero and the string is not of wchar kind, the
                  characters are scanned as well to verify that the
                  narrowest possible kind is used and that the buffer is
                  terminated by a null character.

   Every violated invariant fails an assert().  The function always
   returns 1, which allows it to be called from inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header is inspected */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact and ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must be the data buffer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the element one past the end must be a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
488
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489static PyObject*
490unicode_result_wchar(PyObject *unicode)
491{
492#ifndef Py_DEBUG
493 Py_ssize_t len;
494
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100495 len = _PyUnicode_WSTR_LENGTH(unicode);
496 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100497 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 }
500
501 if (len == 1) {
502 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100503 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100504 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505 Py_DECREF(unicode);
506 return latin1_char;
507 }
508 }
509
510 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200511 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 return NULL;
513 }
514#else
Victor Stinneraa771272012-10-04 02:32:58 +0200515 assert(Py_REFCNT(unicode) == 1);
516
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100517 /* don't make the result ready in debug mode to ensure that the caller
518 makes the string ready before using it */
519 assert(_PyUnicode_CheckConsistency(unicode, 1));
520#endif
521 return unicode;
522}
523
524static PyObject*
525unicode_result_ready(PyObject *unicode)
526{
527 Py_ssize_t length;
528
529 length = PyUnicode_GET_LENGTH(unicode);
530 if (length == 0) {
531 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100532 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200533 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 }
535 return unicode_empty;
536 }
537
538 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200539 void *data = PyUnicode_DATA(unicode);
540 int kind = PyUnicode_KIND(unicode);
541 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 if (ch < 256) {
543 PyObject *latin1_char = unicode_latin1[ch];
544 if (latin1_char != NULL) {
545 if (unicode != latin1_char) {
546 Py_INCREF(latin1_char);
547 Py_DECREF(unicode);
548 }
549 return latin1_char;
550 }
551 else {
552 assert(_PyUnicode_CheckConsistency(unicode, 1));
553 Py_INCREF(unicode);
554 unicode_latin1[ch] = unicode;
555 return unicode;
556 }
557 }
558 }
559
560 assert(_PyUnicode_CheckConsistency(unicode, 1));
561 return unicode;
562}
563
564static PyObject*
565unicode_result(PyObject *unicode)
566{
567 assert(_PyUnicode_CHECK(unicode));
568 if (PyUnicode_IS_READY(unicode))
569 return unicode_result_ready(unicode);
570 else
571 return unicode_result_wchar(unicode);
572}
573
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574static PyObject*
575unicode_result_unchanged(PyObject *unicode)
576{
577 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500578 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100579 return NULL;
580 Py_INCREF(unicode);
581 return unicode;
582 }
583 else
584 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100585 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100586}
587
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200588/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589 ASCII, Latin1, UTF-8, etc. */
590static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200591backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200592 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593{
Victor Stinnerad771582015-10-09 12:38:53 +0200594 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200595 Py_UCS4 ch;
596 enum PyUnicode_Kind kind;
597 void *data;
598
599 assert(PyUnicode_IS_READY(unicode));
600 kind = PyUnicode_KIND(unicode);
601 data = PyUnicode_DATA(unicode);
602
603 size = 0;
604 /* determine replacement size */
605 for (i = collstart; i < collend; ++i) {
606 Py_ssize_t incr;
607
608 ch = PyUnicode_READ(kind, data, i);
609 if (ch < 0x100)
610 incr = 2+2;
611 else if (ch < 0x10000)
612 incr = 2+4;
613 else {
614 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200615 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200616 }
617 if (size > PY_SSIZE_T_MAX - incr) {
618 PyErr_SetString(PyExc_OverflowError,
619 "encoded result is too long for a Python string");
620 return NULL;
621 }
622 size += incr;
623 }
624
Victor Stinnerad771582015-10-09 12:38:53 +0200625 str = _PyBytesWriter_Prepare(writer, str, size);
626 if (str == NULL)
627 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628
629 /* generate replacement */
630 for (i = collstart; i < collend; ++i) {
631 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200632 *str++ = '\\';
633 if (ch >= 0x00010000) {
634 *str++ = 'U';
635 *str++ = Py_hexdigits[(ch>>28)&0xf];
636 *str++ = Py_hexdigits[(ch>>24)&0xf];
637 *str++ = Py_hexdigits[(ch>>20)&0xf];
638 *str++ = Py_hexdigits[(ch>>16)&0xf];
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200641 }
Victor Stinner797485e2015-10-09 03:17:30 +0200642 else if (ch >= 0x100) {
643 *str++ = 'u';
644 *str++ = Py_hexdigits[(ch>>12)&0xf];
645 *str++ = Py_hexdigits[(ch>>8)&0xf];
646 }
647 else
648 *str++ = 'x';
649 *str++ = Py_hexdigits[(ch>>4)&0xf];
650 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 }
652 return str;
653}
654
655/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656 ASCII, Latin1, UTF-8, etc. */
657static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200658xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200659 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660{
Victor Stinnerad771582015-10-09 12:38:53 +0200661 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 Py_UCS4 ch;
663 enum PyUnicode_Kind kind;
664 void *data;
665
666 assert(PyUnicode_IS_READY(unicode));
667 kind = PyUnicode_KIND(unicode);
668 data = PyUnicode_DATA(unicode);
669
670 size = 0;
671 /* determine replacement size */
672 for (i = collstart; i < collend; ++i) {
673 Py_ssize_t incr;
674
675 ch = PyUnicode_READ(kind, data, i);
676 if (ch < 10)
677 incr = 2+1+1;
678 else if (ch < 100)
679 incr = 2+2+1;
680 else if (ch < 1000)
681 incr = 2+3+1;
682 else if (ch < 10000)
683 incr = 2+4+1;
684 else if (ch < 100000)
685 incr = 2+5+1;
686 else if (ch < 1000000)
687 incr = 2+6+1;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+7+1;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
Victor Stinnerad771582015-10-09 12:38:53 +0200700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707 }
708 return str;
709}
710
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711/* --- Bloom Filters ----------------------------------------------------- */
712
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
716
717/* the linebreak mask is set up by Unicode_Init below */
718
Antoine Pitrouf068f942010-01-13 14:19:12 +0000719#if LONG_BIT >= 128
720#define BLOOM_WIDTH 128
721#elif LONG_BIT >= 64
722#define BLOOM_WIDTH 64
723#elif LONG_BIT >= 32
724#define BLOOM_WIDTH 32
725#else
726#error "LONG_BIT is smaller than 32"
727#endif
728
Thomas Wouters477c8d52006-05-27 19:21:47 +0000729#define BLOOM_MASK unsigned long
730
Serhiy Storchaka05997252013-01-26 12:14:02 +0200731static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
Benjamin Peterson29060642009-01-31 22:14:21 +0000735#define BLOOM_LINEBREAK(ch) \
736 ((ch) < 128U ? ascii_linebreak[(ch)] : \
737 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700739static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741{
Victor Stinnera85af502013-04-09 21:53:54 +0200742#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743 do { \
744 TYPE *data = (TYPE *)PTR; \
745 TYPE *end = data + LEN; \
746 Py_UCS4 ch; \
747 for (; data != end; data++) { \
748 ch = *data; \
749 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750 } \
751 break; \
752 } while (0)
753
Thomas Wouters477c8d52006-05-27 19:21:47 +0000754 /* calculate simple bloom-style bitmask for a given unicode string */
755
Antoine Pitrouf068f942010-01-13 14:19:12 +0000756 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000757
758 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200759 switch (kind) {
760 case PyUnicode_1BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762 break;
763 case PyUnicode_2BYTE_KIND:
764 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765 break;
766 case PyUnicode_4BYTE_KIND:
767 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768 break;
769 default:
770 assert(0);
771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200773
774#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775}
776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300777static int
778ensure_unicode(PyObject *obj)
779{
780 if (!PyUnicode_Check(obj)) {
781 PyErr_Format(PyExc_TypeError,
782 "must be str, not %.100s",
783 Py_TYPE(obj)->tp_name);
784 return -1;
785 }
786 return PyUnicode_READY(obj);
787}
788
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200789/* Compilation of templated routines */
790
791#include "stringlib/asciilib.h"
792#include "stringlib/fastsearch.h"
793#include "stringlib/partition.h"
794#include "stringlib/split.h"
795#include "stringlib/count.h"
796#include "stringlib/find.h"
797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs1lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300807#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs2lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300818#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
823#include "stringlib/ucs4lib.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/partition.h"
826#include "stringlib/split.h"
827#include "stringlib/count.h"
828#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300829#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200830#include "stringlib/find_max_char.h"
831#include "stringlib/localeutil.h"
832#include "stringlib/undef.h"
833
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834#include "stringlib/unicodedefs.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/count.h"
837#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100838#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200839
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840/* --- Unicode Object ----------------------------------------------------- */
841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200843fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700845static inline Py_ssize_t
846findchar(const void *s, int kind,
847 Py_ssize_t size, Py_UCS4 ch,
848 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 switch (kind) {
851 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200852 if ((Py_UCS1) ch != ch)
853 return -1;
854 if (direction > 0)
855 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856 else
857 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200858 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200859 if ((Py_UCS2) ch != ch)
860 return -1;
861 if (direction > 0)
862 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863 else
864 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200866 if (direction > 0)
867 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868 else
869 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200870 default:
871 assert(0);
872 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
Victor Stinnerafffce42012-10-03 23:03:17 +0200876#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000877/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200878 earlier.
879
880 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
881 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
882 invalid character in Unicode 6.0. */
883static void
884unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
885{
886 int kind = PyUnicode_KIND(unicode);
887 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
888 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
889 if (length <= old_length)
890 return;
891 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
892}
893#endif
894
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895static PyObject*
896resize_compact(PyObject *unicode, Py_ssize_t length)
897{
898 Py_ssize_t char_size;
899 Py_ssize_t struct_size;
900 Py_ssize_t new_size;
901 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100902 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200903#ifdef Py_DEBUG
904 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905#endif
906
Victor Stinner79891572012-05-03 13:43:07 +0200907 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100909 assert(PyUnicode_IS_COMPACT(unicode));
910
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200911 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913 struct_size = sizeof(PyASCIIObject);
914 else
915 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200916 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919 PyErr_NoMemory();
920 return NULL;
921 }
922 new_size = (struct_size + (length + 1) * char_size);
923
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925 PyObject_DEL(_PyUnicode_UTF8(unicode));
926 _PyUnicode_UTF8(unicode) = NULL;
927 _PyUnicode_UTF8_LENGTH(unicode) = 0;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 _Py_DEC_REFTOTAL;
930 _Py_ForgetReference(unicode);
931
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300932 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100933 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100934 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 PyErr_NoMemory();
936 return NULL;
937 }
Victor Stinner84def372011-12-11 20:04:56 +0100938 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100944 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200945 _PyUnicode_WSTR_LENGTH(unicode) = length;
946 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100947 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_WSTR(unicode));
949 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100950 if (!PyUnicode_IS_ASCII(unicode))
951 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100952 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200953#ifdef Py_DEBUG
954 unicode_fill_invalid(unicode, old_length);
955#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200958 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 return unicode;
960}
961
Alexander Belopolsky40018472011-02-26 01:02:56 +0000962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200963resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964{
Victor Stinner95663112011-10-04 01:03:50 +0200965 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100966 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000969
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 if (PyUnicode_IS_READY(unicode)) {
971 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200972 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200974#ifdef Py_DEBUG
975 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
978 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200979 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982
983 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984 PyErr_NoMemory();
985 return -1;
986 }
987 new_size = (length + 1) * char_size;
988
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990 {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 data = (PyObject *)PyObject_REALLOC(data, new_size);
997 if (data == NULL) {
998 PyErr_NoMemory();
999 return -1;
1000 }
1001 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 _PyUnicode_WSTR_LENGTH(unicode) = length;
1005 }
1006 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001007 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 _PyUnicode_UTF8_LENGTH(unicode) = length;
1009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 _PyUnicode_LENGTH(unicode) = length;
1011 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001012#ifdef Py_DEBUG
1013 unicode_fill_invalid(unicode, old_length);
1014#endif
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001016 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinner95663112011-10-04 01:03:50 +02001020 assert(_PyUnicode_WSTR(unicode) != NULL);
1021
1022 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001023 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001024 PyErr_NoMemory();
1025 return -1;
1026 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001028 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001030 if (!wstr) {
1031 PyErr_NoMemory();
1032 return -1;
1033 }
1034 _PyUnicode_WSTR(unicode) = wstr;
1035 _PyUnicode_WSTR(unicode)[length] = 0;
1036 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001037 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return 0;
1039}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001073 Ux0000 terminated; some code (e.g. new_identifier)
1074 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
1186
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   `unicode`.  A high surrogate directly followed by a low surrogate is
   merged into one code point; every other unit (lone surrogates included)
   is copied through unchanged.  The remaining wchar_t -> UCSx conversions
   are done with macros for efficiency.

   The 4-byte `unicode` object must already have the length of the decoded
   result, and is assumed to have room for one extra code point for the
   terminating null character (which is not written here). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Well-formed pair: two input units, one output code point. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
1522 assert(0);
1523 return -1;
1524 }
1525 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001526 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001527 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001528 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001529 Py_ssize_t i;
1530
Victor Stinnera0702ab2011-09-29 14:14:38 +02001531 for (i=0; i < how_many; i++) {
1532 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (ch > to_maxchar)
1534 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001535 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 }
1538 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 return 0;
1540}
1541
Victor Stinnerd3f08822012-05-29 12:57:52 +02001542void
1543_PyUnicode_FastCopyCharacters(
1544 PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546{
1547 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548}
1549
1550Py_ssize_t
1551PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many)
1554{
1555 int err;
1556
1557 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561
Benjamin Petersonbac79492012-01-14 13:34:47 -05001562 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001564 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001567 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001568 PyErr_SetString(PyExc_IndexError, "string index out of range");
1569 return -1;
1570 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001571 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001572 PyErr_SetString(PyExc_IndexError, "string index out of range");
1573 return -1;
1574 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001575 if (how_many < 0) {
1576 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577 return -1;
1578 }
1579 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001582 "Cannot write %zi characters at %zi "
1583 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001584 how_many, to_start, PyUnicode_GET_LENGTH(to));
1585 return -1;
1586 }
1587
1588 if (how_many == 0)
1589 return 0;
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 return -1;
1593
1594 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595 if (err) {
1596 PyErr_Format(PyExc_SystemError,
1597 "Cannot copy %s characters "
1598 "into a string of %s characters",
1599 unicode_kind_name(from),
1600 unicode_kind_name(to));
1601 return -1;
1602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604}
1605
Victor Stinner17222162011-09-28 22:15:37 +02001606/* Find the maximum code point and count the number of surrogate pairs so a
1607 correct string length can be computed before converting a string to UCS4.
1608 This function counts single surrogates as a character and not as a pair.
1609
1610 Return 0 on success, or -1 on error. */
1611static int
1612find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614{
1615 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001616 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617
Victor Stinnerc53be962011-10-02 21:33:54 +02001618 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 *num_surrogates = 0;
1620 *maxchar = 0;
1621
1622 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001624 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625 && (iter+1) < end
1626 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627 {
1628 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629 ++(*num_surrogates);
1630 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001634 {
1635 ch = *iter;
1636 iter++;
1637 }
1638 if (ch > *maxchar) {
1639 *maxchar = ch;
1640 if (*maxchar > MAX_UNICODE) {
1641 PyErr_Format(PyExc_ValueError,
1642 "character U+%x is not in range [U+0000; U+10ffff]",
1643 ch);
1644 return -1;
1645 }
1646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 }
1648 return 0;
1649}
1650
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001651int
1652_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653{
1654 wchar_t *end;
1655 Py_UCS4 maxchar = 0;
1656 Py_ssize_t num_surrogates;
1657#if SIZEOF_WCHAR_T == 2
1658 Py_ssize_t length_wo_surrogates;
1659#endif
1660
Georg Brandl7597add2011-10-05 16:36:47 +02001661 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001662 strings were created using _PyObject_New() and where no canonical
1663 representation (the str field) has been set yet aka strings
1664 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001665 assert(_PyUnicode_CHECK(unicode));
1666 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001668 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001669 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001670 /* Actually, it should neither be interned nor be anything else: */
1671 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001674 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677
1678 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001679 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyErr_NoMemory();
1682 return -1;
1683 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 _PyUnicode_WSTR(unicode), end,
1686 PyUnicode_1BYTE_DATA(unicode));
1687 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001691 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 }
1695 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001696 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
1700 PyObject_FREE(_PyUnicode_WSTR(unicode));
1701 _PyUnicode_WSTR(unicode) = NULL;
1702 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703 }
1704 /* In this case we might have to convert down from 4-byte native
1705 wchar_t to 2-byte unicode. */
1706 else if (maxchar < 65536) {
1707 assert(num_surrogates == 0 &&
1708 "FindMaxCharAndNumSurrogatePairs() messed up");
1709
Victor Stinner506f5922011-09-28 22:34:18 +02001710#if SIZEOF_WCHAR_T == 2
1711 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718#else
1719 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001720 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001721 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001723 PyErr_NoMemory();
1724 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 }
Victor Stinner506f5922011-09-28 22:34:18 +02001726 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727 _PyUnicode_WSTR(unicode), end,
1728 PyUnicode_2BYTE_DATA(unicode));
1729 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001732 _PyUnicode_UTF8(unicode) = NULL;
1733 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 }
1739 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740 else {
1741#if SIZEOF_WCHAR_T == 2
1742 /* in case the native representation is 2-bytes, we need to allocate a
1743 new normalized 4-byte version. */
1744 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001745 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746 PyErr_NoMemory();
1747 return -1;
1748 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001749 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 PyErr_NoMemory();
1752 return -1;
1753 }
1754 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001758 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 PyObject_FREE(_PyUnicode_WSTR(unicode));
1762 _PyUnicode_WSTR(unicode) = NULL;
1763 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764#else
1765 assert(num_surrogates == 0);
1766
Victor Stinnerc3c74152011-10-02 20:39:55 +02001767 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001769 _PyUnicode_UTF8(unicode) = NULL;
1770 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772#endif
1773 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774 }
1775 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001776 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 return 0;
1778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001781unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald16807132007-05-25 13:52:07 +00001783 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 case SSTATE_NOT_INTERNED:
1785 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001786
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 case SSTATE_INTERNED_MORTAL:
1788 /* revive dead object temporarily for DelItem */
1789 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001790 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 Py_FatalError(
1792 "deletion of interned string failed");
1793 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001794
Benjamin Peterson29060642009-01-31 22:14:21 +00001795 case SSTATE_INTERNED_IMMORTAL:
1796 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001797
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 default:
1799 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001800 }
1801
Victor Stinner03490912011-10-03 23:45:12 +02001802 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001804 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001806 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001809 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string, or a cached one-character Latin-1 string), else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       an entry of the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1828
Alexander Belopolsky40018472011-02-26 01:02:56 +00001829static int
Victor Stinner488fa492011-12-12 00:01:39 +01001830unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831{
Victor Stinner488fa492011-12-12 00:01:39 +01001832 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 if (Py_REFCNT(unicode) != 1)
1834 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001835 if (_PyUnicode_HASH(unicode) != -1)
1836 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837 if (PyUnicode_CHECK_INTERNED(unicode))
1838 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001839 if (!PyUnicode_CheckExact(unicode))
1840 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001841#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001842 /* singleton refcount is greater than 1 */
1843 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001844#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001845 return 1;
1846}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848static int
1849unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850{
1851 PyObject *unicode;
1852 Py_ssize_t old_length;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856
1857 assert(unicode != NULL);
1858 assert(PyUnicode_Check(unicode));
1859 assert(0 <= length);
1860
Victor Stinner910337b2011-10-03 03:20:16 +02001861 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 old_length = PyUnicode_WSTR_LENGTH(unicode);
1863 else
1864 old_length = PyUnicode_GET_LENGTH(unicode);
1865 if (old_length == length)
1866 return 0;
1867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001869 _Py_INCREF_UNICODE_EMPTY();
1870 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001872 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001873 return 0;
1874 }
1875
Victor Stinner488fa492011-12-12 00:01:39 +01001876 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *copy = resize_copy(unicode, length);
1878 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001880 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882 }
1883
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001885 PyObject *new_unicode = resize_compact(unicode, length);
1886 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001888 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001891 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001892}
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001896{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897 PyObject *unicode;
1898 if (p_unicode == NULL) {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001903 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001904 {
1905 PyErr_BadInternalCall();
1906 return -1;
1907 }
1908 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001911/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001913 WARNING: The function doesn't copy the terminating null character and
1914 doesn't check the maximum character (may write a latin1 character in an
1915 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001916static void
1917unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001919{
1920 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923
1924 switch (kind) {
1925 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001927#ifdef Py_DEBUG
1928 if (PyUnicode_IS_ASCII(unicode)) {
1929 Py_UCS4 maxchar = ucs1lib_find_max_char(
1930 (const Py_UCS1*)str,
1931 (const Py_UCS1*)str + len);
1932 assert(maxchar < 128);
1933 }
1934#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001935 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 case PyUnicode_2BYTE_KIND: {
1939 Py_UCS2 *start = (Py_UCS2 *)data + index;
1940 Py_UCS2 *ucs2 = start;
1941 assert(index <= PyUnicode_GET_LENGTH(unicode));
1942
Victor Stinner184252a2012-06-16 02:57:41 +02001943 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 *ucs2 = (Py_UCS2)*str;
1945
1946 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001947 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 default: {
1950 Py_UCS4 *start = (Py_UCS4 *)data + index;
1951 Py_UCS4 *ucs4 = start;
1952 assert(kind == PyUnicode_4BYTE_KIND);
1953 assert(index <= PyUnicode_GET_LENGTH(unicode));
1954
Victor Stinner184252a2012-06-16 02:57:41 +02001955 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001956 *ucs4 = (Py_UCS4)*str;
1957
1958 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001959 }
1960 }
1961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963static PyObject*
1964get_latin1_char(unsigned char ch)
1965{
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode)
1970 return NULL;
1971 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 unicode_latin1[ch] = unicode;
1974 }
1975 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001976 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977}
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979static PyObject*
1980unicode_char(Py_UCS4 ch)
1981{
1982 PyObject *unicode;
1983
1984 assert(ch <= MAX_UNICODE);
1985
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001986 if (ch < 256)
1987 return get_latin1_char(ch);
1988
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 unicode = PyUnicode_New(1, ch);
1990 if (unicode == NULL)
1991 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001992
1993 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001996 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999 }
2000 assert(_PyUnicode_CheckConsistency(unicode, 1));
2001 return unicode;
2002}
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004PyObject *
2005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002007 if (u == NULL)
2008 return (PyObject*)_PyUnicode_New(size);
2009
2010 if (size < 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 return PyUnicode_FromWideChar(u, size);
2016}
2017
2018PyObject *
2019PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 Py_UCS4 maxchar = 0;
2023 Py_ssize_t num_surrogates;
2024
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL && size != 0) {
2026 PyErr_BadInternalCall();
2027 return NULL;
2028 }
2029
2030 if (size == -1) {
2031 size = wcslen(u);
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 /* If the Unicode data is known at construction time, we can apply
2035 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002038 if (size == 0)
2039 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 /* Single character Unicode objects in the Latin-1 range are
2042 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002043 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return get_latin1_char((unsigned char)*u);
2045
2046 /* If not empty and not single character, copy the Unicode data
2047 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002048 if (find_maxchar_surrogates(u, u + size,
2049 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 return NULL;
2051
Victor Stinner8faf8212011-12-08 22:14:11 +01002052 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!unicode)
2054 return NULL;
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 switch (PyUnicode_KIND(unicode)) {
2057 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060 break;
2061 case PyUnicode_2BYTE_KIND:
2062#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002063 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002065 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067#endif
2068 break;
2069 case PyUnicode_4BYTE_KIND:
2070#if SIZEOF_WCHAR_T == 2
2071 /* This is the only case which has to process surrogates, thus
2072 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002073 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#else
2075 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002076 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077#endif
2078 break;
2079 default:
2080 assert(0 && "Impossible state");
2081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002083 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (size < 0) {
2090 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 return NULL;
2093 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002094 if (u != NULL)
2095 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096 else
2097 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002098}
2099
Alexander Belopolsky40018472011-02-26 01:02:56 +00002100PyObject *
2101PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102{
2103 size_t size = strlen(u);
2104 if (size > PY_SSIZE_T_MAX) {
2105 PyErr_SetString(PyExc_OverflowError, "input too long");
2106 return NULL;
2107 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002108 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002109}
2110
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002111PyObject *
2112_PyUnicode_FromId(_Py_Identifier *id)
2113{
2114 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002115 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116 strlen(id->string),
2117 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002118 if (!id->object)
2119 return NULL;
2120 PyUnicode_InternInPlace(&id->object);
2121 assert(!id->next);
2122 id->next = static_strings;
2123 static_strings = id;
2124 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002125 return id->object;
2126}
2127
2128void
2129_PyUnicode_ClearStaticStrings()
2130{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002131 _Py_Identifier *tmp, *s = static_strings;
2132 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002133 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002134 tmp = s->next;
2135 s->next = NULL;
2136 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002138 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139}
2140
Benjamin Peterson0df54292012-03-26 14:50:32 -04002141/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Victor Stinnerd3f08822012-05-29 12:57:52 +02002143PyObject*
2144_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002145{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002146 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002148 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002149#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002152 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002153 }
Victor Stinner785938e2011-12-11 20:09:03 +01002154 unicode = PyUnicode_New(size, 127);
2155 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002156 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002157 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002160}
2161
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002162static Py_UCS4
2163kind_maxchar_limit(unsigned int kind)
2164{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002165 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002166 case PyUnicode_1BYTE_KIND:
2167 return 0x80;
2168 case PyUnicode_2BYTE_KIND:
2169 return 0x100;
2170 case PyUnicode_4BYTE_KIND:
2171 return 0x10000;
2172 default:
2173 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002175 }
2176}
2177
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002178static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002179align_maxchar(Py_UCS4 maxchar)
2180{
2181 if (maxchar <= 127)
2182 return 127;
2183 else if (maxchar <= 255)
2184 return 255;
2185 else if (maxchar <= 65535)
2186 return 65535;
2187 else
2188 return MAX_UNICODE;
2189}
2190
Victor Stinner702c7342011-10-05 13:50:52 +02002191static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002192_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002195 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196
Serhiy Storchaka678db842013-01-26 12:16:36 +02002197 if (size == 0)
2198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002199 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002200 if (size == 1)
2201 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002202
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002203 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002204 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 if (!res)
2206 return NULL;
2207 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002208 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002210}
2211
Victor Stinnere57b1c02011-09-28 22:20:48 +02002212static PyObject*
2213_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214{
2215 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002216 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002217
Serhiy Storchaka678db842013-01-26 12:16:36 +02002218 if (size == 0)
2219 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002220 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002221 if (size == 1)
2222 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002223
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002224 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002225 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 if (!res)
2227 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002230 else {
2231 _PyUnicode_CONVERT_BYTES(
2232 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002234 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 return res;
2236}
2237
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238static PyObject*
2239_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240{
2241 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002242 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243
Serhiy Storchaka678db842013-01-26 12:16:36 +02002244 if (size == 0)
2245 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002247 if (size == 1)
2248 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002250 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 if (!res)
2253 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002254 if (max_char < 256)
2255 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256 PyUnicode_1BYTE_DATA(res));
2257 else if (max_char < 0x10000)
2258 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259 PyUnicode_2BYTE_DATA(res));
2260 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002262 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 return res;
2264}
2265
2266PyObject*
2267PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002269 if (size < 0) {
2270 PyErr_SetString(PyExc_ValueError, "size must be positive");
2271 return NULL;
2272 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002273 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002275 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002277 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002281 PyErr_SetString(PyExc_SystemError, "invalid kind");
2282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284}
2285
Victor Stinnerece58de2012-04-23 23:36:38 +02002286Py_UCS4
2287_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288{
2289 enum PyUnicode_Kind kind;
2290 void *startptr, *endptr;
2291
2292 assert(PyUnicode_IS_READY(unicode));
2293 assert(0 <= start);
2294 assert(end <= PyUnicode_GET_LENGTH(unicode));
2295 assert(start <= end);
2296
2297 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2298 return PyUnicode_MAX_CHAR_VALUE(unicode);
2299
2300 if (start == end)
2301 return 127;
2302
Victor Stinner94d558b2012-04-27 22:26:58 +02002303 if (PyUnicode_IS_ASCII(unicode))
2304 return 127;
2305
Victor Stinnerece58de2012-04-23 23:36:38 +02002306 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002307 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002308 endptr = (char *)startptr + end * kind;
2309 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002310 switch(kind) {
2311 case PyUnicode_1BYTE_KIND:
2312 return ucs1lib_find_max_char(startptr, endptr);
2313 case PyUnicode_2BYTE_KIND:
2314 return ucs2lib_find_max_char(startptr, endptr);
2315 case PyUnicode_4BYTE_KIND:
2316 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002318 assert(0);
2319 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002320 }
2321}
2322
Victor Stinner25a4b292011-10-06 12:31:55 +02002323/* Ensure that a string uses the most efficient storage, if it is not the
2324 case: create a new string with of the right kind. Write NULL into *p_unicode
2325 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002326static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002327unicode_adjust_maxchar(PyObject **p_unicode)
2328{
2329 PyObject *unicode, *copy;
2330 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002331 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 unsigned int kind;
2333
2334 assert(p_unicode != NULL);
2335 unicode = *p_unicode;
2336 assert(PyUnicode_IS_READY(unicode));
2337 if (PyUnicode_IS_ASCII(unicode))
2338 return;
2339
2340 len = PyUnicode_GET_LENGTH(unicode);
2341 kind = PyUnicode_KIND(unicode);
2342 if (kind == PyUnicode_1BYTE_KIND) {
2343 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002344 max_char = ucs1lib_find_max_char(u, u + len);
2345 if (max_char >= 128)
2346 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002347 }
2348 else if (kind == PyUnicode_2BYTE_KIND) {
2349 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002350 max_char = ucs2lib_find_max_char(u, u + len);
2351 if (max_char >= 256)
2352 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002353 }
2354 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002356 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 max_char = ucs4lib_find_max_char(u, u + len);
2358 if (max_char >= 0x10000)
2359 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002360 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002361 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002362 if (copy != NULL)
2363 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 Py_DECREF(unicode);
2365 *p_unicode = copy;
2366}
2367
Victor Stinner034f6cf2011-09-30 02:26:44 +02002368PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002369_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370{
Victor Stinner87af4f22011-11-21 23:03:47 +01002371 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002372 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002373
Victor Stinner034f6cf2011-09-30 02:26:44 +02002374 if (!PyUnicode_Check(unicode)) {
2375 PyErr_BadInternalCall();
2376 return NULL;
2377 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002378 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002380
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 length = PyUnicode_GET_LENGTH(unicode);
2382 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 if (!copy)
2384 return NULL;
2385 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2386
Christian Heimesf051e432016-09-13 20:22:02 +02002387 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002388 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002389 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002391}
2392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393
Victor Stinnerbc603d12011-10-02 01:00:40 +02002394/* Widen Unicode objects to larger buffers. Don't write terminating null
2395 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396
2397void*
2398_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2399{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 Py_ssize_t len;
2401 void *result;
2402 unsigned int skind;
2403
Benjamin Petersonbac79492012-01-14 13:34:47 -05002404 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405 return NULL;
2406
2407 len = PyUnicode_GET_LENGTH(s);
2408 skind = PyUnicode_KIND(s);
2409 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002410 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 return NULL;
2412 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002413 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002415 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 if (!result)
2417 return PyErr_NoMemory();
2418 assert(skind == PyUnicode_1BYTE_KIND);
2419 _PyUnicode_CONVERT_BYTES(
2420 Py_UCS1, Py_UCS2,
2421 PyUnicode_1BYTE_DATA(s),
2422 PyUnicode_1BYTE_DATA(s) + len,
2423 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 if (skind == PyUnicode_2BYTE_KIND) {
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS2, Py_UCS4,
2432 PyUnicode_2BYTE_DATA(s),
2433 PyUnicode_2BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 else {
2437 assert(skind == PyUnicode_1BYTE_KIND);
2438 _PyUnicode_CONVERT_BYTES(
2439 Py_UCS1, Py_UCS4,
2440 PyUnicode_1BYTE_DATA(s),
2441 PyUnicode_1BYTE_DATA(s) + len,
2442 result);
2443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002445 default:
2446 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 }
Victor Stinner01698042011-10-04 00:04:26 +02002448 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 return NULL;
2450}
2451
2452static Py_UCS4*
2453as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2454 int copy_null)
2455{
2456 int kind;
2457 void *data;
2458 Py_ssize_t len, targetlen;
2459 if (PyUnicode_READY(string) == -1)
2460 return NULL;
2461 kind = PyUnicode_KIND(string);
2462 data = PyUnicode_DATA(string);
2463 len = PyUnicode_GET_LENGTH(string);
2464 targetlen = len;
2465 if (copy_null)
2466 targetlen++;
2467 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002468 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!target) {
2470 PyErr_NoMemory();
2471 return NULL;
2472 }
2473 }
2474 else {
2475 if (targetsize < targetlen) {
2476 PyErr_Format(PyExc_SystemError,
2477 "string is longer than the buffer");
2478 if (copy_null && 0 < targetsize)
2479 target[0] = 0;
2480 return NULL;
2481 }
2482 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002483 if (kind == PyUnicode_1BYTE_KIND) {
2484 Py_UCS1 *start = (Py_UCS1 *) data;
2485 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002487 else if (kind == PyUnicode_2BYTE_KIND) {
2488 Py_UCS2 *start = (Py_UCS2 *) data;
2489 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2490 }
2491 else {
2492 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002493 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 if (copy_null)
2496 target[len] = 0;
2497 return target;
2498}
2499
2500Py_UCS4*
2501PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002504 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 PyErr_BadInternalCall();
2506 return NULL;
2507 }
2508 return as_ucs4(string, target, targetsize, copy_null);
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4Copy(PyObject *string)
2513{
2514 return as_ucs4(string, NULL, 0, 1);
2515}
2516
/* Upper bound on the number of characters needed for the output of %lld
   or %p.  At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed,
   plus 1 for the sign; 53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG)*53 - 1) / 22 + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002521
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002522static int
2523unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2524 Py_ssize_t width, Py_ssize_t precision)
2525{
2526 Py_ssize_t length, fill, arglen;
2527 Py_UCS4 maxchar;
2528
2529 if (PyUnicode_READY(str) == -1)
2530 return -1;
2531
2532 length = PyUnicode_GET_LENGTH(str);
2533 if ((precision == -1 || precision >= length)
2534 && width <= length)
2535 return _PyUnicodeWriter_WriteStr(writer, str);
2536
2537 if (precision != -1)
2538 length = Py_MIN(precision, length);
2539
2540 arglen = Py_MAX(length, width);
2541 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2542 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2543 else
2544 maxchar = writer->maxchar;
2545
2546 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2547 return -1;
2548
2549 if (width > length) {
2550 fill = width - length;
2551 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2552 return -1;
2553 writer->pos += fill;
2554 }
2555
2556 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2557 str, 0, length);
2558 writer->pos += length;
2559 return 0;
2560}
2561
2562static int
2563unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2564 Py_ssize_t width, Py_ssize_t precision)
2565{
2566 /* UTF-8 */
2567 Py_ssize_t length;
2568 PyObject *unicode;
2569 int res;
2570
2571 length = strlen(str);
2572 if (precision != -1)
2573 length = Py_MIN(length, precision);
2574 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2575 if (unicode == NULL)
2576 return -1;
2577
2578 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2579 Py_DECREF(unicode);
2580 return res;
2581}
2582
Victor Stinner96865452011-03-01 23:44:09 +00002583static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002584unicode_fromformat_arg(_PyUnicodeWriter *writer,
2585 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002586{
Victor Stinnere215d962012-10-06 23:03:36 +02002587 const char *p;
2588 Py_ssize_t len;
2589 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 Py_ssize_t width;
2591 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002592 int longflag;
2593 int longlongflag;
2594 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002596
2597 p = f;
2598 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002599 zeropad = 0;
2600 if (*f == '0') {
2601 zeropad = 1;
2602 f++;
2603 }
Victor Stinner96865452011-03-01 23:44:09 +00002604
2605 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 width = -1;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002609 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002610 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002612 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002614 return NULL;
2615 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002617 f++;
2618 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 }
2620 precision = -1;
2621 if (*f == '.') {
2622 f++;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 precision = (*f - '0');
2625 f++;
2626 while (Py_ISDIGIT((unsigned)*f)) {
2627 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2628 PyErr_SetString(PyExc_ValueError,
2629 "precision too big");
2630 return NULL;
2631 }
2632 precision = (precision * 10) + (*f - '0');
2633 f++;
2634 }
2635 }
Victor Stinner96865452011-03-01 23:44:09 +00002636 if (*f == '%') {
2637 /* "%.3%s" => f points to "3" */
2638 f--;
2639 }
2640 }
2641 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002642 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002643 f--;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645
2646 /* Handle %ld, %lu, %lld and %llu. */
2647 longflag = 0;
2648 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002649 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002650 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002652 longflag = 1;
2653 ++f;
2654 }
Victor Stinner96865452011-03-01 23:44:09 +00002655 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002656 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002657 longlongflag = 1;
2658 f += 2;
2659 }
Victor Stinner96865452011-03-01 23:44:09 +00002660 }
2661 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002663 size_tflag = 1;
2664 ++f;
2665 }
Victor Stinnere215d962012-10-06 23:03:36 +02002666
2667 if (f[1] == '\0')
2668 writer->overallocate = 0;
2669
2670 switch (*f) {
2671 case 'c':
2672 {
2673 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002674 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002675 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002676 "character argument not in range(0x110000)");
2677 return NULL;
2678 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002679 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002680 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002681 break;
2682 }
2683
2684 case 'i':
2685 case 'd':
2686 case 'u':
2687 case 'x':
2688 {
2689 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002690 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002692
2693 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002694 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002695 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002697 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002698 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002699 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002700 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, size_t));
2703 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, unsigned int));
2706 }
2707 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002709 }
2710 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002714 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002715 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002716 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002717 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002719 va_arg(*vargs, Py_ssize_t));
2720 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002721 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_arg(*vargs, int));
2723 }
2724 assert(len >= 0);
2725
Victor Stinnere215d962012-10-06 23:03:36 +02002726 if (precision < len)
2727 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728
2729 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2731 return NULL;
2732
Victor Stinnere215d962012-10-06 23:03:36 +02002733 if (width > precision) {
2734 Py_UCS4 fillchar;
2735 fill = width - precision;
2736 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002737 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2738 return NULL;
2739 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 }
Victor Stinner15a11362012-10-06 23:48:20 +02002741 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002743 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2744 return NULL;
2745 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002746 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747
Victor Stinner4a587072013-11-19 12:54:53 +01002748 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2749 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002750 break;
2751 }
2752
2753 case 'p':
2754 {
2755 char number[MAX_LONG_LONG_CHARS];
2756
2757 len = sprintf(number, "%p", va_arg(*vargs, void*));
2758 assert(len >= 0);
2759
2760 /* %p is ill-defined: ensure leading 0x. */
2761 if (number[1] == 'X')
2762 number[1] = 'x';
2763 else if (number[1] != 'x') {
2764 memmove(number + 2, number,
2765 strlen(number) + 1);
2766 number[0] = '0';
2767 number[1] = 'x';
2768 len += 2;
2769 }
2770
Victor Stinner4a587072013-11-19 12:54:53 +01002771 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002772 return NULL;
2773 break;
2774 }
2775
2776 case 's':
2777 {
2778 /* UTF-8 */
2779 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002782 break;
2783 }
2784
2785 case 'U':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 assert(obj && _PyUnicode_CHECK(obj));
2789
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 break;
2793 }
2794
2795 case 'V':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002799 if (obj) {
2800 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 }
2804 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 assert(str != NULL);
2806 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002808 }
2809 break;
2810 }
2811
2812 case 'S':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *str;
2816 assert(obj);
2817 str = PyObject_Str(obj);
2818 if (!str)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(str);
2822 return NULL;
2823 }
2824 Py_DECREF(str);
2825 break;
2826 }
2827
2828 case 'R':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *repr;
2832 assert(obj);
2833 repr = PyObject_Repr(obj);
2834 if (!repr)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(repr);
2838 return NULL;
2839 }
2840 Py_DECREF(repr);
2841 break;
2842 }
2843
2844 case 'A':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *ascii;
2848 assert(obj);
2849 ascii = PyObject_ASCII(obj);
2850 if (!ascii)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(ascii);
2854 return NULL;
2855 }
2856 Py_DECREF(ascii);
2857 break;
2858 }
2859
2860 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002861 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864
2865 default:
2866 /* if we stumble upon an unknown formatting code, copy the rest
2867 of the format string to the output string. (we cannot just
2868 skip the code, since there's no way to know what's in the
2869 argument list) */
2870 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002871 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002872 return NULL;
2873 f = p+len;
2874 return f;
2875 }
2876
2877 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002878 return f;
2879}
2880
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881PyObject *
2882PyUnicode_FromFormatV(const char *format, va_list vargs)
2883{
Victor Stinnere215d962012-10-06 23:03:36 +02002884 va_list vargs2;
2885 const char *f;
2886 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002887
Victor Stinner8f674cc2013-04-17 23:02:17 +02002888 _PyUnicodeWriter_Init(&writer);
2889 writer.min_length = strlen(format) + 100;
2890 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002891
Benjamin Peterson0c212142016-09-20 20:39:33 -07002892 // Copy varags to be able to pass a reference to a subfunction.
2893 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002894
2895 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002897 f = unicode_fromformat_arg(&writer, f, &vargs2);
2898 if (f == NULL)
2899 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002902 const char *p;
2903 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904
Victor Stinnere215d962012-10-06 23:03:36 +02002905 p = f;
2906 do
2907 {
2908 if ((unsigned char)*p > 127) {
2909 PyErr_Format(PyExc_ValueError,
2910 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2911 "string, got a non-ASCII byte: 0x%02x",
2912 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 p++;
2916 }
2917 while (*p != '\0' && *p != '%');
2918 len = p - f;
2919
2920 if (*p == '\0')
2921 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002922
2923 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925
2926 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002929 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002930 return _PyUnicodeWriter_Finish(&writer);
2931
2932 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002933 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002934 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936}
2937
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938PyObject *
2939PyUnicode_FromFormat(const char *format, ...)
2940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002941 PyObject* ret;
2942 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002943
2944#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 ret = PyUnicode_FromFormatV(format, vargs);
2950 va_end(vargs);
2951 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954#ifdef HAVE_WCHAR_H
2955
Victor Stinner5593d8a2010-10-02 11:11:27 +00002956/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2957 convert a Unicode object to a wide character string.
2958
Victor Stinnerd88d9832011-09-06 02:00:05 +02002959 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 character) required to convert the unicode object. Ignore size argument.
2961
Victor Stinnerd88d9832011-09-06 02:00:05 +02002962 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002963 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002964 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002965static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002966unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002967 wchar_t *w,
2968 Py_ssize_t size)
2969{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002970 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 const wchar_t *wstr;
2972
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002973 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002974 if (wstr == NULL)
2975 return -1;
2976
Victor Stinner5593d8a2010-10-02 11:11:27 +00002977 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002978 if (size > res)
2979 size = res + 1;
2980 else
2981 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002982 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002983 return res;
2984 }
2985 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002986 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002987}
2988
2989Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002990PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002991 wchar_t *w,
2992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993{
2994 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 PyErr_BadInternalCall();
2996 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002998 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999}
3000
Victor Stinner137c34c2010-09-29 10:25:54 +00003001wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003002PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003003 Py_ssize_t *size)
3004{
3005 wchar_t* buffer;
3006 Py_ssize_t buflen;
3007
3008 if (unicode == NULL) {
3009 PyErr_BadInternalCall();
3010 return NULL;
3011 }
3012
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003013 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 if (buflen == -1)
3015 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003016 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 if (buffer == NULL) {
3018 PyErr_NoMemory();
3019 return NULL;
3020 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003021 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003022 if (buflen == -1) {
3023 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003024 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003025 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003026 if (size != NULL)
3027 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003028 return buffer;
3029}
3030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003031#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Alexander Belopolsky40018472011-02-26 01:02:56 +00003033PyObject *
3034PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003035{
Victor Stinner8faf8212011-12-08 22:14:11 +01003036 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 PyErr_SetString(PyExc_ValueError,
3038 "chr() arg not in range(0x110000)");
3039 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003040 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003041
Victor Stinner985a82a2014-01-03 12:53:47 +01003042 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003046PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003048 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003050 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003051 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003052 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 Py_INCREF(obj);
3054 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003055 }
3056 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 /* For a Unicode subtype that's not a Unicode object,
3058 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003059 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003060 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003061 PyErr_Format(PyExc_TypeError,
3062 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003063 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003064 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003065}
3066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003068PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 const char *encoding,
3070 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003071{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003072 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003073 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003074
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 PyErr_BadInternalCall();
3077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003079
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003080 /* Decoding bytes objects is the most common case and should be fast */
3081 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003082 if (PyBytes_GET_SIZE(obj) == 0)
3083 _Py_RETURN_UNICODE_EMPTY();
3084 v = PyUnicode_Decode(
3085 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3086 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003087 return v;
3088 }
3089
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003090 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 PyErr_SetString(PyExc_TypeError,
3092 "decoding str is not supported");
3093 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003094 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003095
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003096 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3097 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3098 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003099 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003100 Py_TYPE(obj)->tp_name);
3101 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003102 }
Tim Petersced69f82003-09-16 20:30:58 +00003103
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003104 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003105 PyBuffer_Release(&buffer);
3106 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003108
Serhiy Storchaka05997252013-01-26 12:14:02 +02003109 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003110 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003111 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112}
3113
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to lower (lowercased with
   Py_TOLOWER); every run of other characters that sits between two copied
   characters becomes a single '_'.  Leading and trailing runs of such
   characters are dropped.  The output is always null-terminated on success.

   encoding:  null-terminated input name, must not be NULL.
   lower:     output buffer.
   lower_len: capacity of lower in bytes, terminator included.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the terminator; bail out before
       computing &lower[lower_len - 1], which would point outside of it. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    /* Set while skipping a run of separator characters. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the separator run just skipped, unless
               nothing has been written yet. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003165 Py_ssize_t size,
3166 const char *encoding,
3167 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003168{
3169 PyObject *buffer = NULL, *unicode;
3170 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003171 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3172
3173 if (encoding == NULL) {
3174 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3175 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003176
Fred Drakee4315f52000-05-09 19:53:39 +00003177 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003178 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3179 char *lower = buflower;
3180
3181 /* Fast paths */
3182 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3183 lower += 3;
3184 if (*lower == '_') {
3185 /* Match "utf8" and "utf_8" */
3186 lower++;
3187 }
3188
3189 if (lower[0] == '8' && lower[1] == 0) {
3190 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3191 }
3192 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3193 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3194 }
3195 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3196 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3197 }
3198 }
3199 else {
3200 if (strcmp(lower, "ascii") == 0
3201 || strcmp(lower, "us_ascii") == 0) {
3202 return PyUnicode_DecodeASCII(s, size, errors);
3203 }
Steve Dowercc16be82016-09-08 10:35:16 -07003204 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003205 else if (strcmp(lower, "mbcs") == 0) {
3206 return PyUnicode_DecodeMBCS(s, size, errors);
3207 }
3208 #endif
3209 else if (strcmp(lower, "latin1") == 0
3210 || strcmp(lower, "latin_1") == 0
3211 || strcmp(lower, "iso_8859_1") == 0
3212 || strcmp(lower, "iso8859_1") == 0) {
3213 return PyUnicode_DecodeLatin1(s, size, errors);
3214 }
3215 }
Victor Stinner37296e82010-06-10 13:36:23 +00003216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217
3218 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003219 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003220 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003221 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003222 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 if (buffer == NULL)
3224 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003225 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 if (unicode == NULL)
3227 goto onError;
3228 if (!PyUnicode_Check(unicode)) {
3229 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003230 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3231 "use codecs.decode() to decode to arbitrary types",
3232 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003233 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 Py_DECREF(unicode);
3235 goto onError;
3236 }
3237 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003238 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003239
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 Py_XDECREF(buffer);
3242 return NULL;
3243}
3244
Alexander Belopolsky40018472011-02-26 01:02:56 +00003245PyObject *
3246PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003247 const char *encoding,
3248 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003249{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250 if (!PyUnicode_Check(unicode)) {
3251 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003252 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003253 }
3254
Serhiy Storchaka00939072016-10-27 21:05:49 +03003255 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3256 "PyUnicode_AsDecodedObject() is deprecated; "
3257 "use PyCodec_Decode() to decode from str", 1) < 0)
3258 return NULL;
3259
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003260 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003262
3263 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003264 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003265}
3266
Alexander Belopolsky40018472011-02-26 01:02:56 +00003267PyObject *
3268PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003269 const char *encoding,
3270 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003271{
3272 PyObject *v;
3273
3274 if (!PyUnicode_Check(unicode)) {
3275 PyErr_BadArgument();
3276 goto onError;
3277 }
3278
Serhiy Storchaka00939072016-10-27 21:05:49 +03003279 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3280 "PyUnicode_AsDecodedUnicode() is deprecated; "
3281 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3282 return NULL;
3283
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003286
3287 /* Decode via the codec registry */
3288 v = PyCodec_Decode(unicode, encoding, errors);
3289 if (v == NULL)
3290 goto onError;
3291 if (!PyUnicode_Check(v)) {
3292 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003293 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3294 "use codecs.decode() to decode to arbitrary types",
3295 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003296 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003297 Py_DECREF(v);
3298 goto onError;
3299 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003300 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003301
Benjamin Peterson29060642009-01-31 22:14:21 +00003302 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003303 return NULL;
3304}
3305
Alexander Belopolsky40018472011-02-26 01:02:56 +00003306PyObject *
3307PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003308 Py_ssize_t size,
3309 const char *encoding,
3310 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311{
3312 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003313
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003314 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3318 Py_DECREF(unicode);
3319 return v;
3320}
3321
Alexander Belopolsky40018472011-02-26 01:02:56 +00003322PyObject *
3323PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003324 const char *encoding,
3325 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003326{
3327 PyObject *v;
3328
3329 if (!PyUnicode_Check(unicode)) {
3330 PyErr_BadArgument();
3331 goto onError;
3332 }
3333
Serhiy Storchaka00939072016-10-27 21:05:49 +03003334 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3335 "PyUnicode_AsEncodedObject() is deprecated; "
3336 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3337 "or PyCodec_Encode() for generic encoding", 1) < 0)
3338 return NULL;
3339
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003340 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003342
3343 /* Encode via the codec registry */
3344 v = PyCodec_Encode(unicode, encoding, errors);
3345 if (v == NULL)
3346 goto onError;
3347 return v;
3348
Benjamin Peterson29060642009-01-31 22:14:21 +00003349 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003350 return NULL;
3351}
3352
/* Find the index of the first wide character of wstr that wcstombs()
   refuses to convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  The result is the offset,
   in wchar_t units, of the first character for which wcstombs() reports
   an error, or 0 when no such character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) followed by a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together so it is converted as one unit. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3399
Victor Stinner1b579672011-12-17 05:47:23 +01003400static int
3401locale_error_handler(const char *errors, int *surrogateescape)
3402{
Victor Stinner50149202015-09-22 00:26:54 +02003403 _Py_error_handler error_handler = get_error_handler(errors);
3404 switch (error_handler)
3405 {
3406 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003407 *surrogateescape = 0;
3408 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003409 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003410 *surrogateescape = 1;
3411 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003412 default:
3413 PyErr_Format(PyExc_ValueError,
3414 "only 'strict' and 'surrogateescape' error handlers "
3415 "are supported, not '%s'",
3416 errors);
3417 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003418 }
Victor Stinner1b579672011-12-17 05:47:23 +01003419}
3420
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003422PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003423{
3424 Py_ssize_t wlen, wlen2;
3425 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003426 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003427 PyObject *bytes, *reason, *exc;
3428 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003429 int surrogateescape;
3430
3431 if (locale_error_handler(errors, &surrogateescape) < 0)
3432 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003433
3434 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3435 if (wstr == NULL)
3436 return NULL;
3437
3438 wlen2 = wcslen(wstr);
3439 if (wlen2 != wlen) {
3440 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003441 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003442 return NULL;
3443 }
3444
3445 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003446 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003447 char *str;
3448
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003449 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450 if (str == NULL) {
3451 if (error_pos == (size_t)-1) {
3452 PyErr_NoMemory();
3453 PyMem_Free(wstr);
3454 return NULL;
3455 }
3456 else {
3457 goto encode_error;
3458 }
3459 }
3460 PyMem_Free(wstr);
3461
3462 bytes = PyBytes_FromString(str);
3463 PyMem_Free(str);
3464 }
3465 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003466 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003467 size_t len, len2;
3468
3469 len = wcstombs(NULL, wstr, 0);
3470 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003471 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003472 goto encode_error;
3473 }
3474
3475 bytes = PyBytes_FromStringAndSize(NULL, len);
3476 if (bytes == NULL) {
3477 PyMem_Free(wstr);
3478 return NULL;
3479 }
3480
3481 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3482 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003483 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003484 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003485 goto encode_error;
3486 }
3487 PyMem_Free(wstr);
3488 }
3489 return bytes;
3490
3491encode_error:
3492 errmsg = strerror(errno);
3493 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003494
3495 if (error_pos == (size_t)-1)
3496 error_pos = wcstombs_errorpos(wstr);
3497
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003498 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003499
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003500 wstr = Py_DecodeLocale(errmsg, &errlen);
3501 if (wstr != NULL) {
3502 reason = PyUnicode_FromWideChar(wstr, errlen);
3503 PyMem_RawFree(wstr);
3504 } else {
3505 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003506 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003507
Victor Stinner2f197072011-12-17 07:08:30 +01003508 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003509 reason = PyUnicode_FromString(
3510 "wcstombs() encountered an unencodable "
3511 "wide character");
3512 if (reason == NULL)
3513 return NULL;
3514
3515 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3516 "locale", unicode,
3517 (Py_ssize_t)error_pos,
3518 (Py_ssize_t)(error_pos+1),
3519 reason);
3520 Py_DECREF(reason);
3521 if (exc != NULL) {
3522 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003523 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003524 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003525 return NULL;
3526}
3527
Victor Stinnerad158722010-10-27 00:25:46 +00003528PyObject *
3529PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003530{
Steve Dowercc16be82016-09-08 10:35:16 -07003531#if defined(__APPLE__)
3532 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003533#else
Victor Stinner793b5312011-04-27 00:24:21 +02003534 PyInterpreterState *interp = PyThreadState_GET()->interp;
3535 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3536 cannot use it to encode and decode filenames before it is loaded. Load
3537 the Python codec requires to encode at least its own filename. Use the C
3538 version of the locale codec until the codec registry is initialized and
3539 the Python codec is loaded.
3540
3541 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3542 cannot only rely on it: check also interp->fscodec_initialized for
3543 subinterpreters. */
3544 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003545 return PyUnicode_AsEncodedString(unicode,
3546 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003547 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003548 }
3549 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003550 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003551 }
Victor Stinnerad158722010-10-27 00:25:46 +00003552#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003553}
3554
Alexander Belopolsky40018472011-02-26 01:02:56 +00003555PyObject *
3556PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003557 const char *encoding,
3558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559{
3560 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003561 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003562
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 if (!PyUnicode_Check(unicode)) {
3564 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 }
Fred Drakee4315f52000-05-09 19:53:39 +00003567
Victor Stinner942889a2016-09-05 15:40:10 -07003568 if (encoding == NULL) {
3569 return _PyUnicode_AsUTF8String(unicode, errors);
3570 }
3571
Fred Drakee4315f52000-05-09 19:53:39 +00003572 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003573 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3574 char *lower = buflower;
3575
3576 /* Fast paths */
3577 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3578 lower += 3;
3579 if (*lower == '_') {
3580 /* Match "utf8" and "utf_8" */
3581 lower++;
3582 }
3583
3584 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003585 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003586 }
3587 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3588 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3589 }
3590 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3591 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3592 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003593 }
Victor Stinner942889a2016-09-05 15:40:10 -07003594 else {
3595 if (strcmp(lower, "ascii") == 0
3596 || strcmp(lower, "us_ascii") == 0) {
3597 return _PyUnicode_AsASCIIString(unicode, errors);
3598 }
Steve Dowercc16be82016-09-08 10:35:16 -07003599#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003600 else if (strcmp(lower, "mbcs") == 0) {
3601 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3602 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003603#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003604 else if (strcmp(lower, "latin1") == 0 ||
3605 strcmp(lower, "latin_1") == 0 ||
3606 strcmp(lower, "iso_8859_1") == 0 ||
3607 strcmp(lower, "iso8859_1") == 0) {
3608 return _PyUnicode_AsLatin1String(unicode, errors);
3609 }
3610 }
Victor Stinner37296e82010-06-10 13:36:23 +00003611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612
3613 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003614 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003616 return NULL;
3617
3618 /* The normal path */
3619 if (PyBytes_Check(v))
3620 return v;
3621
3622 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003623 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003624 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003625 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003626
3627 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003628 "encoder %s returned bytearray instead of bytes; "
3629 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003630 encoding);
3631 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003632 Py_DECREF(v);
3633 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003634 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003635
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003636 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3637 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003638 Py_DECREF(v);
3639 return b;
3640 }
3641
3642 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003643 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3644 "use codecs.encode() to encode to arbitrary types",
3645 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003646 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003647 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003648 return NULL;
3649}
3650
Alexander Belopolsky40018472011-02-26 01:02:56 +00003651PyObject *
3652PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003653 const char *encoding,
3654 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003655{
3656 PyObject *v;
3657
3658 if (!PyUnicode_Check(unicode)) {
3659 PyErr_BadArgument();
3660 goto onError;
3661 }
3662
Serhiy Storchaka00939072016-10-27 21:05:49 +03003663 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3664 "PyUnicode_AsEncodedUnicode() is deprecated; "
3665 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3666 return NULL;
3667
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003668 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003670
3671 /* Encode via the codec registry */
3672 v = PyCodec_Encode(unicode, encoding, errors);
3673 if (v == NULL)
3674 goto onError;
3675 if (!PyUnicode_Check(v)) {
3676 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003677 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3678 "use codecs.encode() to encode to arbitrary types",
3679 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003680 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003681 Py_DECREF(v);
3682 goto onError;
3683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003685
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 return NULL;
3688}
3689
/* Return the byte offset in str (of length len) at which mbrtowc() reports
   a conversion error or an incomplete character.  Returns 0 when no such
   position is found, and always returns 0 when mbrtowc() is unavailable. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t nbytes = mbrtowc(&wc, cursor, remaining, &state);

        if (nbytes == 0) {
            /* Reached end of string */
            break;
        }
        if (nbytes == (size_t)-1 || nbytes == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += nbytes;
        remaining -= nbytes;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3720
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003721PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003722PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003723 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003724{
3725 wchar_t smallbuf[256];
3726 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3727 wchar_t *wstr;
3728 size_t wlen, wlen2;
3729 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003730 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003731 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003732 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003733 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003734
3735 if (locale_error_handler(errors, &surrogateescape) < 0)
3736 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003737
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003738 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3739 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003740 return NULL;
3741 }
3742
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003743 if (surrogateescape) {
3744 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003745 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003746 if (wstr == NULL) {
3747 if (wlen == (size_t)-1)
3748 PyErr_NoMemory();
3749 else
3750 PyErr_SetFromErrno(PyExc_OSError);
3751 return NULL;
3752 }
3753
3754 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003755 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003756 }
3757 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003758 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003759#ifndef HAVE_BROKEN_MBSTOWCS
3760 wlen = mbstowcs(NULL, str, 0);
3761#else
3762 wlen = len;
3763#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003764 if (wlen == (size_t)-1)
3765 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003766 if (wlen+1 <= smallbuf_len) {
3767 wstr = smallbuf;
3768 }
3769 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003770 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003771 if (!wstr)
3772 return PyErr_NoMemory();
3773 }
3774
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003775 wlen2 = mbstowcs(wstr, str, wlen+1);
3776 if (wlen2 == (size_t)-1) {
3777 if (wstr != smallbuf)
3778 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003779 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003780 }
3781#ifdef HAVE_BROKEN_MBSTOWCS
3782 assert(wlen2 == wlen);
3783#endif
3784 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3785 if (wstr != smallbuf)
3786 PyMem_Free(wstr);
3787 }
3788 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003789
3790decode_error:
3791 errmsg = strerror(errno);
3792 assert(errmsg != NULL);
3793
3794 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003795 wstr = Py_DecodeLocale(errmsg, &errlen);
3796 if (wstr != NULL) {
3797 reason = PyUnicode_FromWideChar(wstr, errlen);
3798 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003799 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003800
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003801 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003802 reason = PyUnicode_FromString(
3803 "mbstowcs() encountered an invalid multibyte sequence");
3804 if (reason == NULL)
3805 return NULL;
3806
3807 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3808 "locale", str, len,
3809 (Py_ssize_t)error_pos,
3810 (Py_ssize_t)(error_pos+1),
3811 reason);
3812 Py_DECREF(reason);
3813 if (exc != NULL) {
3814 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003815 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003816 }
3817 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003818}
3819
3820PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003821PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003822{
3823 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003824 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003825}
3826
3827
3828PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003829PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003830 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003831 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3832}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003833
Christian Heimes5894ba72007-11-04 11:43:14 +00003834PyObject*
3835PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3836{
Steve Dowercc16be82016-09-08 10:35:16 -07003837#if defined(__APPLE__)
3838 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003839#else
Victor Stinner793b5312011-04-27 00:24:21 +02003840 PyInterpreterState *interp = PyThreadState_GET()->interp;
3841 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3842 cannot use it to encode and decode filenames before it is loaded. Load
3843 the Python codec requires to encode at least its own filename. Use the C
3844 version of the locale codec until the codec registry is initialized and
3845 the Python codec is loaded.
3846
3847 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3848 cannot only rely on it: check also interp->fscodec_initialized for
3849 subinterpreters. */
3850 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003851 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003852 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003853 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003854 }
3855 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003856 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003857 }
Victor Stinnerad158722010-10-27 00:25:46 +00003858#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003859}
3860
Martin v. Löwis011e8422009-05-05 04:43:17 +00003861
3862int
3863PyUnicode_FSConverter(PyObject* arg, void* addr)
3864{
Brett Cannonec6ce872016-09-06 15:50:29 -07003865 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003866 PyObject *output = NULL;
3867 Py_ssize_t size;
3868 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003869 if (arg == NULL) {
3870 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003871 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003872 return 1;
3873 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003874 path = PyOS_FSPath(arg);
3875 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003876 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003877 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003878 if (PyBytes_Check(path)) {
3879 output = path;
3880 }
3881 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3882 output = PyUnicode_EncodeFSDefault(path);
3883 Py_DECREF(path);
3884 if (!output) {
3885 return 0;
3886 }
3887 assert(PyBytes_Check(output));
3888 }
3889
Victor Stinner0ea2a462010-04-30 00:22:08 +00003890 size = PyBytes_GET_SIZE(output);
3891 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003892 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003893 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003894 Py_DECREF(output);
3895 return 0;
3896 }
3897 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003898 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003899}
3900
3901
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003902int
3903PyUnicode_FSDecoder(PyObject* arg, void* addr)
3904{
Brett Cannona5711202016-09-06 19:36:01 -07003905 int is_buffer = 0;
3906 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003907 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003908 if (arg == NULL) {
3909 Py_DECREF(*(PyObject**)addr);
3910 return 1;
3911 }
Brett Cannona5711202016-09-06 19:36:01 -07003912
3913 is_buffer = PyObject_CheckBuffer(arg);
3914 if (!is_buffer) {
3915 path = PyOS_FSPath(arg);
3916 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003917 return 0;
3918 }
Brett Cannona5711202016-09-06 19:36:01 -07003919 }
3920 else {
3921 path = arg;
3922 Py_INCREF(arg);
3923 }
3924
3925 if (PyUnicode_Check(path)) {
3926 if (PyUnicode_READY(path) == -1) {
3927 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003928 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003929 }
3930 output = path;
3931 }
3932 else if (PyBytes_Check(path) || is_buffer) {
3933 PyObject *path_bytes = NULL;
3934
3935 if (!PyBytes_Check(path) &&
3936 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3937 "path should be string, bytes, or os.PathLike, not %.200s",
3938 Py_TYPE(arg)->tp_name)) {
3939 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003940 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003941 }
3942 path_bytes = PyBytes_FromObject(path);
3943 Py_DECREF(path);
3944 if (!path_bytes) {
3945 return 0;
3946 }
3947 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3948 PyBytes_GET_SIZE(path_bytes));
3949 Py_DECREF(path_bytes);
3950 if (!output) {
3951 return 0;
3952 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003953 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003954 else {
3955 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003956 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003957 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003958 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003959 return 0;
3960 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003961 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003962 Py_DECREF(output);
3963 return 0;
3964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003965 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003966 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003967 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003968 Py_DECREF(output);
3969 return 0;
3970 }
3971 *(PyObject**)addr = output;
3972 return Py_CLEANUP_SUPPORTED;
3973}
3974
3975
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003976const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003978{
Christian Heimesf3863112007-11-22 07:46:41 +00003979 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003981 if (!PyUnicode_Check(unicode)) {
3982 PyErr_BadArgument();
3983 return NULL;
3984 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003985 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003986 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003988 if (PyUnicode_UTF8(unicode) == NULL) {
3989 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003990 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 if (bytes == NULL)
3992 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003993 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3994 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003995 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 Py_DECREF(bytes);
3997 return NULL;
3998 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003999 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004000 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004001 PyBytes_AS_STRING(bytes),
4002 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 Py_DECREF(bytes);
4004 }
4005
4006 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004007 *psize = PyUnicode_UTF8_LENGTH(unicode);
4008 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004009}
4010
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004011const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4015}
4016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017Py_UNICODE *
4018PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004020 const unsigned char *one_byte;
4021#if SIZEOF_WCHAR_T == 4
4022 const Py_UCS2 *two_bytes;
4023#else
4024 const Py_UCS4 *four_bytes;
4025 const Py_UCS4 *ucs4_end;
4026 Py_ssize_t num_surrogates;
4027#endif
4028 wchar_t *w;
4029 wchar_t *wchar_end;
4030
4031 if (!PyUnicode_Check(unicode)) {
4032 PyErr_BadArgument();
4033 return NULL;
4034 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004035 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004037 assert(_PyUnicode_KIND(unicode) != 0);
4038 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004040 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004042 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4043 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044 num_surrogates = 0;
4045
4046 for (; four_bytes < ucs4_end; ++four_bytes) {
4047 if (*four_bytes > 0xFFFF)
4048 ++num_surrogates;
4049 }
4050
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004051 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4052 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4053 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 PyErr_NoMemory();
4055 return NULL;
4056 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004057 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004059 w = _PyUnicode_WSTR(unicode);
4060 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4061 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4063 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004064 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004066 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4067 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 }
4069 else
4070 *w = *four_bytes;
4071
4072 if (w > wchar_end) {
4073 assert(0 && "Miscalculated string end");
4074 }
4075 }
4076 *w = 0;
4077#else
4078 /* sizeof(wchar_t) == 4 */
4079 Py_FatalError("Impossible unicode object state, wstr and str "
4080 "should share memory already.");
4081 return NULL;
4082#endif
4083 }
4084 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004085 if ((size_t)_PyUnicode_LENGTH(unicode) >
4086 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4087 PyErr_NoMemory();
4088 return NULL;
4089 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004090 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4091 (_PyUnicode_LENGTH(unicode) + 1));
4092 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 PyErr_NoMemory();
4094 return NULL;
4095 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004096 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4097 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4098 w = _PyUnicode_WSTR(unicode);
4099 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004101 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4102 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103 for (; w < wchar_end; ++one_byte, ++w)
4104 *w = *one_byte;
4105 /* null-terminate the wstr */
4106 *w = 0;
4107 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004108 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004109#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004110 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004111 for (; w < wchar_end; ++two_bytes, ++w)
4112 *w = *two_bytes;
4113 /* null-terminate the wstr */
4114 *w = 0;
4115#else
4116 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004117 PyObject_FREE(_PyUnicode_WSTR(unicode));
4118 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 Py_FatalError("Impossible unicode object state, wstr "
4120 "and str should share memory already.");
4121 return NULL;
4122#endif
4123 }
4124 else {
4125 assert(0 && "This should never happen.");
4126 }
4127 }
4128 }
4129 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004130 *size = PyUnicode_WSTR_LENGTH(unicode);
4131 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004132}
4133
Alexander Belopolsky40018472011-02-26 01:02:56 +00004134Py_UNICODE *
4135PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138}
4139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140
Alexander Belopolsky40018472011-02-26 01:02:56 +00004141Py_ssize_t
4142PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143{
4144 if (!PyUnicode_Check(unicode)) {
4145 PyErr_BadArgument();
4146 goto onError;
4147 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004148 if (_PyUnicode_WSTR(unicode) == NULL) {
4149 if (PyUnicode_AsUnicode(unicode) == NULL)
4150 goto onError;
4151 }
4152 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 return -1;
4156}
4157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158Py_ssize_t
4159PyUnicode_GetLength(PyObject *unicode)
4160{
Victor Stinner07621332012-06-16 04:53:46 +02004161 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004162 PyErr_BadArgument();
4163 return -1;
4164 }
Victor Stinner07621332012-06-16 04:53:46 +02004165 if (PyUnicode_READY(unicode) == -1)
4166 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 return PyUnicode_GET_LENGTH(unicode);
4168}
4169
4170Py_UCS4
4171PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4172{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004173 void *data;
4174 int kind;
4175
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004176 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4177 PyErr_BadArgument();
4178 return (Py_UCS4)-1;
4179 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004180 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004181 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 return (Py_UCS4)-1;
4183 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004184 data = PyUnicode_DATA(unicode);
4185 kind = PyUnicode_KIND(unicode);
4186 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004187}
4188
4189int
4190PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4191{
4192 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004193 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 return -1;
4195 }
Victor Stinner488fa492011-12-12 00:01:39 +01004196 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004197 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004198 PyErr_SetString(PyExc_IndexError, "string index out of range");
4199 return -1;
4200 }
Victor Stinner488fa492011-12-12 00:01:39 +01004201 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004202 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004203 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4204 PyErr_SetString(PyExc_ValueError, "character out of range");
4205 return -1;
4206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4208 index, ch);
4209 return 0;
4210}
4211
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4217
Victor Stinner554f3f02010-06-16 23:33:54 +00004218/* create or adjust a UnicodeDecodeError */
4219static void
4220make_decode_exception(PyObject **exceptionObject,
4221 const char *encoding,
4222 const char *input, Py_ssize_t length,
4223 Py_ssize_t startpos, Py_ssize_t endpos,
4224 const char *reason)
4225{
4226 if (*exceptionObject == NULL) {
4227 *exceptionObject = PyUnicodeDecodeError_Create(
4228 encoding, input, length, startpos, endpos, reason);
4229 }
4230 else {
4231 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4232 goto onError;
4233 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4234 goto onError;
4235 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4236 goto onError;
4237 }
4238 return;
4239
4240onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004241 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004242}
4243
Steve Dowercc16be82016-09-08 10:35:16 -07004244#ifdef MS_WINDOWS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245/* error handling callback helper:
4246 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004247 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 and adjust various state variables.
4249 return 0 on success, -1 on error
4250*/
4251
Alexander Belopolsky40018472011-02-26 01:02:56 +00004252static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004253unicode_decode_call_errorhandler_wchar(
4254 const char *errors, PyObject **errorHandler,
4255 const char *encoding, const char *reason,
4256 const char **input, const char **inend, Py_ssize_t *startinpos,
4257 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4258 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004260 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261
4262 PyObject *restuple = NULL;
4263 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004264 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004265 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004266 Py_ssize_t requiredsize;
4267 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004268 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004269 wchar_t *repwstr;
4270 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004272 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4273 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 *errorHandler = PyCodec_LookupError(errors);
4277 if (*errorHandler == NULL)
4278 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 }
4280
Victor Stinner554f3f02010-06-16 23:33:54 +00004281 make_decode_exception(exceptionObject,
4282 encoding,
4283 *input, *inend - *input,
4284 *startinpos, *endinpos,
4285 reason);
4286 if (*exceptionObject == NULL)
4287 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004289 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004293 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004296 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298
4299 /* Copy back the bytes variables, which might have been modified by the
4300 callback */
4301 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4302 if (!inputobj)
4303 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004304 *input = PyBytes_AS_STRING(inputobj);
4305 insize = PyBytes_GET_SIZE(inputobj);
4306 *inend = *input + insize;
4307 /* we can DECREF safely, as the exception has another reference,
4308 so the object won't go away. */
4309 Py_DECREF(inputobj);
4310
4311 if (newpos<0)
4312 newpos = insize+newpos;
4313 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004314 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004315 goto onError;
4316 }
4317
4318 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4319 if (repwstr == NULL)
4320 goto onError;
4321 /* need more space? (at least enough for what we
4322 have+the replacement+the rest of the string (starting
4323 at the new input position), so we won't have to check space
4324 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004325 requiredsize = *outpos;
4326 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4327 goto overflow;
4328 requiredsize += repwlen;
4329 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4330 goto overflow;
4331 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004333 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004334 requiredsize = 2*outsize;
4335 if (unicode_resize(output, requiredsize) < 0)
4336 goto onError;
4337 }
4338 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4339 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004340 *endinpos = newpos;
4341 *inptr = *input + newpos;
4342
4343 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004344 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004345 return 0;
4346
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004347 overflow:
4348 PyErr_SetString(PyExc_OverflowError,
4349 "decoded result is too long for a Python string");
4350
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351 onError:
4352 Py_XDECREF(restuple);
4353 return -1;
4354}
Steve Dowercc16be82016-09-08 10:35:16 -07004355#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356
4357static int
4358unicode_decode_call_errorhandler_writer(
4359 const char *errors, PyObject **errorHandler,
4360 const char *encoding, const char *reason,
4361 const char **input, const char **inend, Py_ssize_t *startinpos,
4362 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4363 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4364{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004365 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004366
4367 PyObject *restuple = NULL;
4368 PyObject *repunicode = NULL;
4369 Py_ssize_t insize;
4370 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004371 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004372 PyObject *inputobj = NULL;
4373
4374 if (*errorHandler == NULL) {
4375 *errorHandler = PyCodec_LookupError(errors);
4376 if (*errorHandler == NULL)
4377 goto onError;
4378 }
4379
4380 make_decode_exception(exceptionObject,
4381 encoding,
4382 *input, *inend - *input,
4383 *startinpos, *endinpos,
4384 reason);
4385 if (*exceptionObject == NULL)
4386 goto onError;
4387
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004388 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004389 if (restuple == NULL)
4390 goto onError;
4391 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004392 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004393 goto onError;
4394 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004395 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004396 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004397
4398 /* Copy back the bytes variables, which might have been modified by the
4399 callback */
4400 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4401 if (!inputobj)
4402 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004403 *input = PyBytes_AS_STRING(inputobj);
4404 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004405 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004406 /* we can DECREF safely, as the exception has another reference,
4407 so the object won't go away. */
4408 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004412 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004413 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004415 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416
Victor Stinner170ca6f2013-04-18 00:25:28 +02004417 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004418 if (replen > 1) {
4419 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004420 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004421 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4422 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4423 goto onError;
4424 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004426 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004429 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004430
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004432 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004433 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004437 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438}
4439
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004440/* --- UTF-7 Codec -------------------------------------------------------- */
4441
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442/* See RFC2152 for details. We encode conservatively and decode liberally. */
4443
4444/* Three simple macros defining base-64. */
4445
/* Does c belong to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/')?
   Note: c is evaluated several times. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))

/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds;
   any character that is not a letter, a digit or '+' maps to 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('+' == (c)) ? 62 : 63)

/* Base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: should this byte of a UTF-7 string be decoded as itself?
 * Decoding is permissive: every ASCII byte decodes to itself except '+',
 * which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) < 128)
4474
4475/* The UTF-7 encoder treats ASCII characters differently according to
4476 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4477 * the above). See RFC2152. This array identifies these different
4478 * sets:
4479 * 0 : "Set D"
4480 * alphanumeric and '(),-./:?
4481 * 1 : "Set O"
4482 * !"#$%&*;<=>@[]^_`{|}
4483 * 2 : "whitespace"
4484 * ht nl cr sp
4485 * 3 : special (must be base64 encoded)
4486 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4487 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004488
/* Category of each ASCII code for the UTF-7 encoder (0: Set D,
   1: Set O, 2: whitespace, 3: must be base-64 encoded).  The table is
   only ever read, so it is declared const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times.  NUL and anything >= 128 are never
 * direct; the range test comes first so the table is only indexed
 * with 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520
Alexander Belopolsky40018472011-02-26 01:02:56 +00004521PyObject *
4522PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004523 Py_ssize_t size,
4524 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004525{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004526 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4527}
4528
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529/* The decoder. The only state we preserve is our read position,
4530 * i.e. how many characters we have consumed. So if we end in the
4531 * middle of a shift sequence we have to back off the read position
4532 * and the output to the beginning of the sequence, otherwise we lose
4533 * all the shift state (seen bits, number of bits seen, high
4534 * surrogate). */
4535
Alexander Belopolsky40018472011-02-26 01:02:56 +00004536PyObject *
4537PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004538 Py_ssize_t size,
4539 const char *errors,
4540 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004541{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004543 Py_ssize_t startinpos;
4544 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004545 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004546 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547 const char *errmsg = "";
4548 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004549 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004550 unsigned int base64bits = 0;
4551 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004552 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 PyObject *errorHandler = NULL;
4554 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004556 if (size == 0) {
4557 if (consumed)
4558 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004559 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004560 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004562 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004563 _PyUnicodeWriter_Init(&writer);
4564 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004565
4566 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567 e = s + size;
4568
4569 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004570 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004571 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004572 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 if (inShift) { /* in a base-64 section */
4575 if (IS_BASE64(ch)) { /* consume a base-64 character */
4576 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4577 base64bits += 6;
4578 s++;
4579 if (base64bits >= 16) {
4580 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004581 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 base64bits -= 16;
4583 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004584 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 if (surrogate) {
4586 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004587 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4588 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004589 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004590 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004592 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 }
4594 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004595 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004596 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004598 }
4599 }
Victor Stinner551ac952011-11-29 22:58:13 +01004600 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 /* first surrogate */
4602 surrogate = outCh;
4603 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004605 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004606 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 }
4608 }
4609 }
4610 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 if (base64bits > 0) { /* left-over bits */
4613 if (base64bits >= 6) {
4614 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004615 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 errmsg = "partial character in shift sequence";
4617 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 else {
4620 /* Some bits remain; they should be zero */
4621 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004622 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 errmsg = "non-zero padding bits in shift sequence";
4624 goto utf7Error;
4625 }
4626 }
4627 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004628 if (surrogate && DECODE_DIRECT(ch)) {
4629 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4630 goto onError;
4631 }
4632 surrogate = 0;
4633 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 /* '-' is absorbed; other terminating
4635 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004636 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
4639 }
4640 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 s++; /* consume '+' */
4643 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004645 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004646 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 }
4648 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004650 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004651 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004653 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654 }
4655 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004656 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004657 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004658 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004659 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004660 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 else {
4662 startinpos = s-starts;
4663 s++;
4664 errmsg = "unexpected special character";
4665 goto utf7Error;
4666 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004667 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 errors, &errorHandler,
4672 "utf7", errmsg,
4673 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004674 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676 }
4677
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 /* end of string */
4679
4680 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4681 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004682 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683 if (surrogate ||
4684 (base64bits >= 6) ||
4685 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004686 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004687 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 errors, &errorHandler,
4689 "utf7", "unterminated shift sequence",
4690 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004691 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004692 goto onError;
4693 if (s < e)
4694 goto restart;
4695 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004696 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697
4698 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004699 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004700 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004701 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004702 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004703 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004704 writer.kind, writer.data, shiftOutStart);
4705 Py_XDECREF(errorHandler);
4706 Py_XDECREF(exc);
4707 _PyUnicodeWriter_Dealloc(&writer);
4708 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004709 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004710 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004711 }
4712 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004713 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004716
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 Py_XDECREF(errorHandler);
4718 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004719 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 Py_XDECREF(errorHandler);
4723 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004724 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004725 return NULL;
4726}
4727
4728
Alexander Belopolsky40018472011-02-26 01:02:56 +00004729PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004730_PyUnicode_EncodeUTF7(PyObject *str,
4731 int base64SetO,
4732 int base64WhiteSpace,
4733 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004734{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004735 int kind;
4736 void *data;
4737 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004738 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004740 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 unsigned int base64bits = 0;
4742 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004743 char * out;
4744 char * start;
4745
Benjamin Petersonbac79492012-01-14 13:34:47 -05004746 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004747 return NULL;
4748 kind = PyUnicode_KIND(str);
4749 data = PyUnicode_DATA(str);
4750 len = PyUnicode_GET_LENGTH(str);
4751
4752 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004755 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004756 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004757 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004758 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 if (v == NULL)
4760 return NULL;
4761
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004762 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004763 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004764 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 if (inShift) {
4767 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4768 /* shifting out */
4769 if (base64bits) { /* output remaining bits */
4770 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4771 base64buffer = 0;
4772 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004773 }
4774 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 /* Characters not in the BASE64 set implicitly unshift the sequence
4776 so no '-' is required, except if the character is itself a '-' */
4777 if (IS_BASE64(ch) || ch == '-') {
4778 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004779 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004780 *out++ = (char) ch;
4781 }
4782 else {
4783 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004784 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 else { /* not in a shift sequence */
4787 if (ch == '+') {
4788 *out++ = '+';
4789 *out++ = '-';
4790 }
4791 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4792 *out++ = (char) ch;
4793 }
4794 else {
4795 *out++ = '+';
4796 inShift = 1;
4797 goto encode_char;
4798 }
4799 }
4800 continue;
4801encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004803 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004804
Antoine Pitrou244651a2009-05-04 18:56:13 +00004805 /* code first surrogate */
4806 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004807 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 while (base64bits >= 6) {
4809 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4810 base64bits -= 6;
4811 }
4812 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004813 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 base64bits += 16;
4816 base64buffer = (base64buffer << 16) | ch;
4817 while (base64bits >= 6) {
4818 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4819 base64bits -= 6;
4820 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004821 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 if (base64bits)
4823 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4824 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004825 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004826 if (_PyBytes_Resize(&v, out - start) < 0)
4827 return NULL;
4828 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004829}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004830PyObject *
4831PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4832 Py_ssize_t size,
4833 int base64SetO,
4834 int base64WhiteSpace,
4835 const char *errors)
4836{
4837 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004838 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004839 if (tmp == NULL)
4840 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004841 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004842 base64WhiteSpace, errors);
4843 Py_DECREF(tmp);
4844 return result;
4845}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004846
Antoine Pitrou244651a2009-05-04 18:56:13 +00004847#undef IS_BASE64
4848#undef FROM_BASE64
4849#undef TO_BASE64
4850#undef DECODE_DIRECT
4851#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004852
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853/* --- UTF-8 Codec -------------------------------------------------------- */
4854
Alexander Belopolsky40018472011-02-26 01:02:56 +00004855PyObject *
4856PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004857 Py_ssize_t size,
4858 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859{
Walter Dörwald69652032004-09-07 20:24:22 +00004860 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4861}
4862
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863#include "stringlib/asciilib.h"
4864#include "stringlib/codecs.h"
4865#include "stringlib/undef.h"
4866
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004867#include "stringlib/ucs1lib.h"
4868#include "stringlib/codecs.h"
4869#include "stringlib/undef.h"
4870
4871#include "stringlib/ucs2lib.h"
4872#include "stringlib/codecs.h"
4873#include "stringlib/undef.h"
4874
4875#include "stringlib/ucs4lib.h"
4876#include "stringlib/codecs.h"
4877#include "stringlib/undef.h"
4878
Antoine Pitrouab868312009-01-10 15:40:25 +00004879/* Mask to quickly check whether a C 'long' contains a
4880 non-ASCII, UTF8-encoded char. */
4881#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004882# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004883#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004884# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004885#else
4886# error C 'long' size should be either 4 or 8!
4887#endif
4888
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889static Py_ssize_t
4890ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004893 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004894
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004895 /*
4896 * Issue #17237: m68k is a bit different from most architectures in
4897 * that objects do not use "natural alignment" - for example, int and
4898 * long are only aligned at 2-byte boundaries. Therefore the assert()
4899 * won't work; also, tests have shown that skipping the "optimised
4900 * version" will even speed up m68k.
4901 */
4902#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004904 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4905 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004906 /* Fast path, see in STRINGLIB(utf8_decode) for
4907 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004908 /* Help allocation */
4909 const char *_p = p;
4910 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 while (_p < aligned_end) {
4912 unsigned long value = *(const unsigned long *) _p;
4913 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915 *((unsigned long *)q) = value;
4916 _p += SIZEOF_LONG;
4917 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004918 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 p = _p;
4920 while (p < end) {
4921 if ((unsigned char)*p & 0x80)
4922 break;
4923 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004928#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004929 while (p < end) {
4930 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4931 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004932 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004933 /* Help allocation */
4934 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935 while (_p < aligned_end) {
4936 unsigned long value = *(unsigned long *) _p;
4937 if (value & ASCII_CHAR_MASK)
4938 break;
4939 _p += SIZEOF_LONG;
4940 }
4941 p = _p;
4942 if (_p == end)
4943 break;
4944 }
4945 if ((unsigned char)*p & 0x80)
4946 break;
4947 ++p;
4948 }
4949 memcpy(dest, start, p - start);
4950 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951}
Antoine Pitrouab868312009-01-10 15:40:25 +00004952
Victor Stinner785938e2011-12-11 20:09:03 +01004953PyObject *
4954PyUnicode_DecodeUTF8Stateful(const char *s,
4955 Py_ssize_t size,
4956 const char *errors,
4957 Py_ssize_t *consumed)
4958{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004959 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004960 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962
4963 Py_ssize_t startinpos;
4964 Py_ssize_t endinpos;
4965 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004966 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004967 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004968 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004969
4970 if (size == 0) {
4971 if (consumed)
4972 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004973 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004974 }
4975
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004976 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4977 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004978 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004979 *consumed = 1;
4980 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004981 }
4982
Victor Stinner8f674cc2013-04-17 23:02:17 +02004983 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004984 writer.min_length = size;
4985 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004986 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004987
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 writer.pos = ascii_decode(s, end, writer.data);
4989 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 while (s < end) {
4991 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004992 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004993
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 if (PyUnicode_IS_ASCII(writer.buffer))
4996 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004998 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 } else {
5002 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 }
5005
5006 switch (ch) {
5007 case 0:
5008 if (s == end || consumed)
5009 goto End;
5010 errmsg = "unexpected end of data";
5011 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005012 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 break;
5014 case 1:
5015 errmsg = "invalid start byte";
5016 startinpos = s - starts;
5017 endinpos = startinpos + 1;
5018 break;
5019 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005020 case 3:
5021 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005022 errmsg = "invalid continuation byte";
5023 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005024 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 break;
5026 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005027 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 goto onError;
5029 continue;
5030 }
5031
Victor Stinner1d65d912015-10-05 13:43:50 +02005032 if (error_handler == _Py_ERROR_UNKNOWN)
5033 error_handler = get_error_handler(errors);
5034
5035 switch (error_handler) {
5036 case _Py_ERROR_IGNORE:
5037 s += (endinpos - startinpos);
5038 break;
5039
5040 case _Py_ERROR_REPLACE:
5041 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5042 goto onError;
5043 s += (endinpos - startinpos);
5044 break;
5045
5046 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005047 {
5048 Py_ssize_t i;
5049
Victor Stinner1d65d912015-10-05 13:43:50 +02005050 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5051 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005052 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005053 ch = (Py_UCS4)(unsigned char)(starts[i]);
5054 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5055 ch + 0xdc00);
5056 writer.pos++;
5057 }
5058 s += (endinpos - startinpos);
5059 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005060 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005061
5062 default:
5063 if (unicode_decode_call_errorhandler_writer(
5064 errors, &error_handler_obj,
5065 "utf-8", errmsg,
5066 &starts, &end, &startinpos, &endinpos, &exc, &s,
5067 &writer))
5068 goto onError;
5069 }
Victor Stinner785938e2011-12-11 20:09:03 +01005070 }
5071
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005073 if (consumed)
5074 *consumed = s - starts;
5075
Victor Stinner1d65d912015-10-05 13:43:50 +02005076 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005078 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079
5080onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005081 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005083 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005085}
5086
Xavier de Gaye76febd02016-12-15 20:59:58 +01005087#if defined(__APPLE__) || defined(__ANDROID__)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005088
5089/* Simplified UTF-8 decoder using surrogateescape error handler,
Xavier de Gaye76febd02016-12-15 20:59:58 +01005090 used to decode the command line arguments on Mac OS X and Android.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005091
5092 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005093 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005094
5095wchar_t*
5096_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5097{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005098 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005099 wchar_t *unicode;
5100 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005101
5102 /* Note: size will always be longer than the resulting Unicode
5103 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005104 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005105 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005106 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005107 if (!unicode)
5108 return NULL;
5109
5110 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005111 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005112 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005113 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005114 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005115#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005116 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005117#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005118 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005119#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005120 if (ch > 0xFF) {
5121#if SIZEOF_WCHAR_T == 4
5122 assert(0);
5123#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005124 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005125 /* compute and append the two surrogates: */
5126 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5127 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5128#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005129 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 else {
5131 if (!ch && s == e)
5132 break;
5133 /* surrogateescape */
5134 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5135 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005136 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005138 return unicode;
5139}
5140
Xavier de Gaye76febd02016-12-15 20:59:58 +01005141#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005143/* Primary internal function which creates utf8 encoded bytes objects.
5144
5145 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005146 and allocate exactly as much space needed at the end. Else allocate the
5147 maximum possible needed (4 result bytes per Unicode character), and return
5148 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005149*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005150PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005151_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152{
Victor Stinner6099a032011-12-18 14:22:26 +01005153 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005154 void *data;
5155 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005157 if (!PyUnicode_Check(unicode)) {
5158 PyErr_BadArgument();
5159 return NULL;
5160 }
5161
5162 if (PyUnicode_READY(unicode) == -1)
5163 return NULL;
5164
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005165 if (PyUnicode_UTF8(unicode))
5166 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5167 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168
5169 kind = PyUnicode_KIND(unicode);
5170 data = PyUnicode_DATA(unicode);
5171 size = PyUnicode_GET_LENGTH(unicode);
5172
Benjamin Petersonead6b532011-12-20 17:23:42 -06005173 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005174 default:
5175 assert(0);
5176 case PyUnicode_1BYTE_KIND:
5177 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5178 assert(!PyUnicode_IS_ASCII(unicode));
5179 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5180 case PyUnicode_2BYTE_KIND:
5181 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5182 case PyUnicode_4BYTE_KIND:
5183 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185}
5186
Alexander Belopolsky40018472011-02-26 01:02:56 +00005187PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005188PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5189 Py_ssize_t size,
5190 const char *errors)
5191{
5192 PyObject *v, *unicode;
5193
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005194 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005195 if (unicode == NULL)
5196 return NULL;
5197 v = _PyUnicode_AsUTF8String(unicode, errors);
5198 Py_DECREF(unicode);
5199 return v;
5200}
5201
5202PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005203PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005205 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206}
5207
Walter Dörwald41980ca2007-08-16 21:55:45 +00005208/* --- UTF-32 Codec ------------------------------------------------------- */
5209
5210PyObject *
5211PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 Py_ssize_t size,
5213 const char *errors,
5214 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005215{
5216 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5217}
5218
5219PyObject *
5220PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 Py_ssize_t size,
5222 const char *errors,
5223 int *byteorder,
5224 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225{
5226 const char *starts = s;
5227 Py_ssize_t startinpos;
5228 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005229 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005230 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005231 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005232 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234 PyObject *errorHandler = NULL;
5235 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005236
Walter Dörwald41980ca2007-08-16 21:55:45 +00005237 q = (unsigned char *)s;
5238 e = q + size;
5239
5240 if (byteorder)
5241 bo = *byteorder;
5242
5243 /* Check for BOM marks (U+FEFF) in the input and adjust current
5244 byte order setting accordingly. In native mode, the leading BOM
5245 mark is skipped, in all other modes, it is copied to the output
5246 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005247 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005248 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005249 if (bom == 0x0000FEFF) {
5250 bo = -1;
5251 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005253 else if (bom == 0xFFFE0000) {
5254 bo = 1;
5255 q += 4;
5256 }
5257 if (byteorder)
5258 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005259 }
5260
Victor Stinnere64322e2012-10-30 23:12:47 +01005261 if (q == e) {
5262 if (consumed)
5263 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005264 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005265 }
5266
Victor Stinnere64322e2012-10-30 23:12:47 +01005267#ifdef WORDS_BIGENDIAN
5268 le = bo < 0;
5269#else
5270 le = bo <= 0;
5271#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005272 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005273
Victor Stinner8f674cc2013-04-17 23:02:17 +02005274 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005275 writer.min_length = (e - q + 3) / 4;
5276 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005277 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005278
Victor Stinnere64322e2012-10-30 23:12:47 +01005279 while (1) {
5280 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005281 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005282
Victor Stinnere64322e2012-10-30 23:12:47 +01005283 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005284 enum PyUnicode_Kind kind = writer.kind;
5285 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005286 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005287 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005288 if (le) {
5289 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005290 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005291 if (ch > maxch)
5292 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005293 if (kind != PyUnicode_1BYTE_KIND &&
5294 Py_UNICODE_IS_SURROGATE(ch))
5295 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005296 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 q += 4;
5298 } while (q <= last);
5299 }
5300 else {
5301 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005302 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005303 if (ch > maxch)
5304 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005305 if (kind != PyUnicode_1BYTE_KIND &&
5306 Py_UNICODE_IS_SURROGATE(ch))
5307 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005309 q += 4;
5310 } while (q <= last);
5311 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005313 }
5314
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005316 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005317 startinpos = ((const char *)q) - starts;
5318 endinpos = startinpos + 4;
5319 }
5320 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005323 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005325 startinpos = ((const char *)q) - starts;
5326 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005328 else {
5329 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005330 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 goto onError;
5332 q += 4;
5333 continue;
5334 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005335 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 startinpos = ((const char *)q) - starts;
5337 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005339
5340 /* The remaining input chars are ignored if the callback
5341 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005342 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005344 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005346 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005348 }
5349
Walter Dörwald41980ca2007-08-16 21:55:45 +00005350 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352
Walter Dörwald41980ca2007-08-16 21:55:45 +00005353 Py_XDECREF(errorHandler);
5354 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005355 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005356
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005359 Py_XDECREF(errorHandler);
5360 Py_XDECREF(exc);
5361 return NULL;
5362}
5363
5364PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005365_PyUnicode_EncodeUTF32(PyObject *str,
5366 const char *errors,
5367 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005369 enum PyUnicode_Kind kind;
5370 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005371 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005372 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005373 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005374#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005375 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005376#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005377 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005379 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005380 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005381 PyObject *errorHandler = NULL;
5382 PyObject *exc = NULL;
5383 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005384
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005385 if (!PyUnicode_Check(str)) {
5386 PyErr_BadArgument();
5387 return NULL;
5388 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005389 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005390 return NULL;
5391 kind = PyUnicode_KIND(str);
5392 data = PyUnicode_DATA(str);
5393 len = PyUnicode_GET_LENGTH(str);
5394
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005396 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005397 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005398 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399 if (v == NULL)
5400 return NULL;
5401
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005402 /* output buffer is 4-bytes aligned */
5403 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005404 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005405 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005406 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005407 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005408 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005409
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005410 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005411 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005412 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005413 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005414 else
5415 encoding = "utf-32";
5416
5417 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5419 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005420 }
5421
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 pos = 0;
5423 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005425
5426 if (kind == PyUnicode_2BYTE_KIND) {
5427 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5428 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005429 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 else {
5431 assert(kind == PyUnicode_4BYTE_KIND);
5432 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5433 &out, native_ordering);
5434 }
5435 if (pos == len)
5436 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005437
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005438 rep = unicode_encode_call_errorhandler(
5439 errors, &errorHandler,
5440 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005441 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005442 if (!rep)
5443 goto error;
5444
5445 if (PyBytes_Check(rep)) {
5446 repsize = PyBytes_GET_SIZE(rep);
5447 if (repsize & 3) {
5448 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005449 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 "surrogates not allowed");
5451 goto error;
5452 }
5453 moreunits = repsize / 4;
5454 }
5455 else {
5456 assert(PyUnicode_Check(rep));
5457 if (PyUnicode_READY(rep) < 0)
5458 goto error;
5459 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5460 if (!PyUnicode_IS_ASCII(rep)) {
5461 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005462 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005463 "surrogates not allowed");
5464 goto error;
5465 }
5466 }
5467
5468 /* four bytes are reserved for each surrogate */
5469 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005470 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005471 Py_ssize_t morebytes = 4 * (moreunits - 1);
5472 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5473 /* integer overflow */
5474 PyErr_NoMemory();
5475 goto error;
5476 }
5477 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5478 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005479 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005480 }
5481
5482 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005483 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005484 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005486 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005487 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5488 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 }
5490
5491 Py_CLEAR(rep);
5492 }
5493
5494 /* Cut back to size actually needed. This is necessary for, for example,
5495 encoding of a string containing isolated surrogates and the 'ignore'
5496 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005497 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 if (nsize != PyBytes_GET_SIZE(v))
5499 _PyBytes_Resize(&v, nsize);
5500 Py_XDECREF(errorHandler);
5501 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005502 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005503 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005504 error:
5505 Py_XDECREF(rep);
5506 Py_XDECREF(errorHandler);
5507 Py_XDECREF(exc);
5508 Py_XDECREF(v);
5509 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005510}
5511
Alexander Belopolsky40018472011-02-26 01:02:56 +00005512PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005513PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5514 Py_ssize_t size,
5515 const char *errors,
5516 int byteorder)
5517{
5518 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005519 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005520 if (tmp == NULL)
5521 return NULL;
5522 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5523 Py_DECREF(tmp);
5524 return result;
5525}
5526
5527PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005528PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005529{
Victor Stinnerb960b342011-11-20 19:12:52 +01005530 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005531}
5532
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533/* --- UTF-16 Codec ------------------------------------------------------- */
5534
Tim Peters772747b2001-08-09 22:21:55 +00005535PyObject *
5536PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 Py_ssize_t size,
5538 const char *errors,
5539 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540{
Walter Dörwald69652032004-09-07 20:24:22 +00005541 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5542}
5543
/* Decode up to `size` bytes of UTF-16 data starting at `s`.

   errors    - error handler name, passed through to
               unicode_decode_call_errorhandler_writer().
   byteorder - optional in/out byte order.  A value < 0 means little-endian,
               > 0 means big-endian and 0 means native order.  When it is
               NULL or points at 0 and at least two bytes are available,
               the first two bytes are checked for a BOM: FF FE selects
               little-endian (-1) and FE FF selects big-endian (1); a BOM
               that is found is skipped.  In that case the resulting value
               is stored back through a non-NULL pointer.
   consumed  - optional out parameter.  When non-NULL, incomplete data at
               the end of the input is not reported as an error: decoding
               stops there and *consumed receives the number of input bytes
               that were used.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL (after releasing
   the writer, error handler and exception) when the writer or the error
   handler reported a failure. */
5544PyObject *
5545PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 Py_ssize_t size,
5547 const char *errors,
5548 int *byteorder,
5549 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005550{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 Py_ssize_t startinpos;
5553 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005554 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005555 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005556 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005557 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005558 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 PyObject *errorHandler = NULL;
5560 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005561 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
/* q walks the input; e is one past its last byte. */
Tim Peters772747b2001-08-09 22:21:55 +00005563 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005564 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565
5566 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005567 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005569 /* Check for BOM marks (U+FEFF) in the input and adjust current
5570 byte order setting accordingly. In native mode, the leading BOM
5571 mark is skipped, in all other modes, it is copied to the output
5572 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005573 if (bo == 0 && size >= 2) {
/* The first two bytes are combined as a little-endian 16-bit value. */
5574 const Py_UCS4 bom = (q[1] << 8) | q[0];
5575 if (bom == 0xFEFF) {
5576 q += 2;
5577 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005579 else if (bom == 0xFFFE) {
5580 q += 2;
5581 bo = 1;
5582 }
5583 if (byteorder)
5584 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
/* Nothing left to decode (empty input, or only a BOM): the whole input
   counts as consumed and the empty string is returned. */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005587 if (q == e) {
5588 if (consumed)
5589 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005590 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005591 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005592
/* Work out whether the data is in the machine's own byte order and which
   codec name to report to the error handler. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005593#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005594 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005595 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005596#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005597 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005598 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005599#endif
Tim Peters772747b2001-08-09 22:21:55 +00005600
Antoine Pitrou63065d72012-05-15 23:48:04 +02005601 /* Note: size will always be longer than the resulting Unicode
5602 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005603 _PyUnicodeWriter_Init(&writer);
/* One output character per two input bytes, rounding an odd count up. */
Victor Stinner170ca6f2013-04-18 00:25:28 +02005604 writer.min_length = (e - q + 1) / 2;
5605 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608 while (1) {
/* ch stays 0 when fewer than two bytes remain and no helper is called. */
5609 Py_UCS4 ch = 0;
5610 if (e - q >= 2) {
/* Call the decode helper that matches the writer's current buffer kind;
   the helper advances q and writer.pos through the pointers it is given. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005611 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005612 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005613 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005614 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005615 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005616 native_ordering);
5617 else
5618 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005619 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005620 native_ordering);
5621 } else if (kind == PyUnicode_2BYTE_KIND) {
5622 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 native_ordering);
5625 } else {
5626 assert(kind == PyUnicode_4BYTE_KIND);
5627 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005629 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005630 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005631 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632
/* Values 0-3 are treated as status codes; any other value is a character
   that is written to the output before looping again. */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005633 switch (ch)
5634 {
5635 case 0:
5636 /* remaining byte at the end? (size should be even) */
5637 if (q == e || consumed)
5638 goto End;
5639 errmsg = "truncated data";
5640 startinpos = ((const char *)q) - starts;
5641 endinpos = ((const char *)e) - starts;
5642 break;
5643 /* The remaining input chars are ignored if the callback
5644 chooses to skip the input */
5645 case 1:
/* q is moved back two bytes before stopping or reporting the error, so
   those bytes are not counted as consumed.  NOTE(review): presumably the
   helper had already stepped over a lead surrogate - confirm against the
   helper implementation. */
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005646 q -= 2;
5647 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005648 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005649 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005650 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005651 endinpos = ((const char *)e) - starts;
5652 break;
5653 case 2:
/* The error range is the two bytes just before q. */
5654 errmsg = "illegal encoding";
5655 startinpos = ((const char *)q) - 2 - starts;
5656 endinpos = startinpos + 2;
5657 break;
5658 case 3:
/* The error range is the first two of the four bytes just before q. */
5659 errmsg = "illegal UTF-16 surrogate";
5660 startinpos = ((const char *)q) - 4 - starts;
5661 endinpos = startinpos + 2;
5662 break;
5663 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005664 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005665 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 continue;
5667 }
5668
/* Only the error cases reach this point.  The handler receives the
   addresses of starts, e and q, so it may update them. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005669 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005670 errors,
5671 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005672 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005673 &starts,
5674 (const char **)&e,
5675 &startinpos,
5676 &endinpos,
5677 &exc,
5678 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005679 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 }
5682
Antoine Pitrou63065d72012-05-15 23:48:04 +02005683End:
/* Report how far into the input decoding got. */
Walter Dörwald69652032004-09-07 20:24:22 +00005684 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005686
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687 Py_XDECREF(errorHandler);
5688 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005692 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 Py_XDECREF(errorHandler);
5694 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 return NULL;
5696}
5697
/* Encode the str object `str` as UTF-16 and return the result as a bytes
   object.

   errors    - error handler name used when a character cannot be encoded
               (the reason reported is "surrogates not allowed").
   byteorder - < 0: little-endian, > 0: big-endian, 0: native byte order
               with a BOM (0xFEFF) written as the first code unit.

   Returns NULL on failure: after PyErr_BadArgument() when `str` is not a
   str object, after PyErr_NoMemory() when the output size would overflow,
   or when one of the helpers called here fails. */
Tim Peters772747b2001-08-09 22:21:55 +00005698PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005699_PyUnicode_EncodeUTF16(PyObject *str,
5700 const char *errors,
5701 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005703 enum PyUnicode_Kind kind;
5704 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005705 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005706 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005707 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005708 Py_ssize_t pairs;
/* byteorder 0 counts as native on both kinds of platform. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005709#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005710 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005711#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005712 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005713#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005714 const char *encoding;
5715 Py_ssize_t nsize, pos;
5716 PyObject *errorHandler = NULL;
5717 PyObject *exc = NULL;
5718 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005719
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005720 if (!PyUnicode_Check(str)) {
5721 PyErr_BadArgument();
5722 return NULL;
5723 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005724 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005725 return NULL;
5726 kind = PyUnicode_KIND(str);
5727 data = PyUnicode_DATA(str);
5728 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005729
/* Count the characters >= 0x10000; each one takes an extra 16-bit unit in
   the output.  Only 4-byte strings are scanned for them. */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005731 if (kind == PyUnicode_4BYTE_KIND) {
5732 const Py_UCS4 *in = (const Py_UCS4 *)data;
5733 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005734 while (in < end) {
5735 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005737 }
5738 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005739 }
/* Guard the "nsize * 2" byte count computed below against overflow. */
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005740 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005742 }
/* nsize is in 16-bit units here: characters + extra units + optional BOM. */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005743 nsize = len + pairs + (byteorder == 0);
5744 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005745 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005750 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005751 out = (unsigned short *)PyBytes_AS_STRING(v);
/* The BOM is stored as a native 16-bit value. */
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005753 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005754 }
5755 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005756 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 }
Tim Peters772747b2001-08-09 22:21:55 +00005758
/* 1-byte strings are encoded with a single helper call and skip the
   error-handling loop entirely. */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005759 if (kind == PyUnicode_1BYTE_KIND) {
5760 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5761 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005762 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005763
/* Codec name passed to the error handler and to raised exceptions. */
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005766 }
5767 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005769 }
5770 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005771 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005772 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005773
5774 pos = 0;
5775 while (pos < len) {
5776 Py_ssize_t repsize, moreunits;
5777
/* The helper's return value is added to pos; if pos has not reached len
   afterwards, the character at pos is handed to the error handler. */
5778 if (kind == PyUnicode_2BYTE_KIND) {
5779 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5780 &out, native_ordering);
5781 }
5782 else {
5783 assert(kind == PyUnicode_4BYTE_KIND);
5784 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5785 &out, native_ordering);
5786 }
5787 if (pos == len)
5788 break;
5789
/* The handler is given the range [pos, pos + 1) and writes the position to
   resume from back into pos. */
5790 rep = unicode_encode_call_errorhandler(
5791 errors, &errorHandler,
5792 encoding, "surrogates not allowed",
5793 str, &exc, pos, pos + 1, &pos);
5794 if (!rep)
5795 goto error;
5796
/* A bytes replacement must be a whole number of 16-bit units. */
5797 if (PyBytes_Check(rep)) {
5798 repsize = PyBytes_GET_SIZE(rep);
5799 if (repsize & 1) {
5800 raise_encode_exception(&exc, encoding,
5801 str, pos - 1, pos,
5802 "surrogates not allowed");
5803 goto error;
5804 }
5805 moreunits = repsize / 2;
5806 }
5807 else {
/* A str replacement is accepted only when it is pure ASCII; it then needs
   one 16-bit unit per character. */
5808 assert(PyUnicode_Check(rep));
5809 if (PyUnicode_READY(rep) < 0)
5810 goto error;
5811 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5812 if (!PyUnicode_IS_ASCII(rep)) {
5813 raise_encode_exception(&exc, encoding,
5814 str, pos - 1, pos,
5815 "surrogates not allowed");
5816 goto error;
5817 }
5818 }
5819
5820 /* two bytes are reserved for each surrogate */
5821 if (moreunits > 1) {
/* Grow the buffer by the units beyond the one already reserved.  The write
   position is saved as an offset and out is recomputed afterwards, since
   v is passed by address and may be replaced by the resize. */
5822 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5823 Py_ssize_t morebytes = 2 * (moreunits - 1);
5824 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5825 /* integer overflow */
5826 PyErr_NoMemory();
5827 goto error;
5828 }
5829 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5830 goto error;
5831 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5832 }
5833
/* Bytes replacements are copied verbatim; str replacements are encoded
   with the requested byte order. */
5834 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005835 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 out += moreunits;
5837 } else /* rep is unicode */ {
5838 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5839 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5840 &out, native_ordering);
5841 }
5842
5843 Py_CLEAR(rep);
5844 }
5845
5846 /* Cut back to size actually needed. This is necessary for, for example,
5847 encoding of a string containing isolated surrogates and the 'ignore' handler
5848 is used. */
/* From here on nsize is a byte count, not a unit count. */
5849 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5850 if (nsize != PyBytes_GET_SIZE(v))
5851 _PyBytes_Resize(&v, nsize);
5852 Py_XDECREF(errorHandler);
5853 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005854 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005855 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005856 error:
5857 Py_XDECREF(rep);
5858 Py_XDECREF(errorHandler);
5859 Py_XDECREF(exc);
5860 Py_XDECREF(v);
5861 return NULL;
/* NOTE(review): no STORECHAR macro is defined inside this function; this
   #undef looks like a leftover - confirm before removing. */
5862#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863}
5864
Alexander Belopolsky40018472011-02-26 01:02:56 +00005865PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005866PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5867 Py_ssize_t size,
5868 const char *errors,
5869 int byteorder)
5870{
5871 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005872 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005873 if (tmp == NULL)
5874 return NULL;
5875 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5876 Py_DECREF(tmp);
5877 return result;
5878}
5879
5880PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005881PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884}
5885
5886/* --- Unicode Escape Codec ----------------------------------------------- */
5887
/* Cached pointer to the character-name lookup C API.  It starts out NULL;
   _PyUnicode_DecodeUnicodeEscape() fills it in with PyCapsule_Import() the
   first time it meets a \N escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005888static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005889
Alexander Belopolsky40018472011-02-26 01:02:56 +00005890PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005891_PyUnicode_DecodeUnicodeEscape(const char *s,
5892 Py_ssize_t size,
5893 const char *errors,
5894 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005897 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899 PyObject *errorHandler = NULL;
5900 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005901
Eric V. Smith42454af2016-10-31 09:22:08 -04005902 // so we can remember if we've seen an invalid escape char or not
5903 *first_invalid_escape = NULL;
5904
Victor Stinner62ec3312016-09-06 17:04:34 -07005905 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005906 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005907 }
5908 /* Escaped strings will always be longer than the resulting
5909 Unicode string, so we start with size here and then reduce the
5910 length after conversion to the true value.
5911 (but if the error callback returns a long replacement string
5912 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005913 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005914 writer.min_length = size;
5915 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5916 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005917 }
5918
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 end = s + size;
5920 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005921 unsigned char c = (unsigned char) *s++;
5922 Py_UCS4 ch;
5923 int count;
5924 Py_ssize_t startinpos;
5925 Py_ssize_t endinpos;
5926 const char *message;
5927
5928#define WRITE_ASCII_CHAR(ch) \
5929 do { \
5930 assert(ch <= 127); \
5931 assert(writer.pos < writer.size); \
5932 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5933 } while(0)
5934
5935#define WRITE_CHAR(ch) \
5936 do { \
5937 if (ch <= writer.maxchar) { \
5938 assert(writer.pos < writer.size); \
5939 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5940 } \
5941 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5942 goto onError; \
5943 } \
5944 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945
5946 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005947 if (c != '\\') {
5948 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 continue;
5950 }
5951
Victor Stinner62ec3312016-09-06 17:04:34 -07005952 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005954 if (s >= end) {
5955 message = "\\ at end of string";
5956 goto error;
5957 }
5958 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005959
Victor Stinner62ec3312016-09-06 17:04:34 -07005960 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005961 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005964 case '\n': continue;
5965 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5966 case '\'': WRITE_ASCII_CHAR('\''); continue;
5967 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5968 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005969 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005970 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5971 case 't': WRITE_ASCII_CHAR('\t'); continue;
5972 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5973 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005974 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005975 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005976 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005977 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 case '0': case '1': case '2': case '3':
5981 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005982 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005983 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 ch = (ch<<3) + *s++ - '0';
5985 if (s < end && '0' <= *s && *s <= '7') {
5986 ch = (ch<<3) + *s++ - '0';
5987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005989 WRITE_CHAR(ch);
5990 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* hex escapes */
5993 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07005995 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005996 message = "truncated \\xXX escape";
5997 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006002 message = "truncated \\uXXXX escape";
6003 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006006 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006007 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006008 message = "truncated \\UXXXXXXXX escape";
6009 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006010 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006011 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006012 ch <<= 4;
6013 if (c >= '0' && c <= '9') {
6014 ch += c - '0';
6015 }
6016 else if (c >= 'a' && c <= 'f') {
6017 ch += c - ('a' - 10);
6018 }
6019 else if (c >= 'A' && c <= 'F') {
6020 ch += c - ('A' - 10);
6021 }
6022 else {
6023 break;
6024 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006025 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006026 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006027 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006028 }
6029
6030 /* when we get here, ch is a 32-bit unicode character */
6031 if (ch > MAX_UNICODE) {
6032 message = "illegal Unicode character";
6033 goto error;
6034 }
6035
6036 WRITE_CHAR(ch);
6037 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006038
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006040 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006041 if (ucnhash_CAPI == NULL) {
6042 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006043 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6044 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006045 if (ucnhash_CAPI == NULL) {
6046 PyErr_SetString(
6047 PyExc_UnicodeError,
6048 "\\N escapes not supported (can't load unicodedata module)"
6049 );
6050 goto onError;
6051 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006052 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006053
6054 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006056 const char *start = ++s;
6057 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006061 namelen = s - start;
6062 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006063 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006064 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 ch = 0xffffffff; /* in case 'getcode' messes up */
6066 if (namelen <= INT_MAX &&
6067 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6068 &ch, 0)) {
6069 assert(ch <= MAX_UNICODE);
6070 WRITE_CHAR(ch);
6071 continue;
6072 }
6073 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006074 }
6075 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006076 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006077
6078 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006079 if (*first_invalid_escape == NULL) {
6080 *first_invalid_escape = s-1; /* Back up one char, since we've
6081 already incremented s. */
6082 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006083 WRITE_ASCII_CHAR('\\');
6084 WRITE_CHAR(c);
6085 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006087
6088 error:
6089 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006090 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006091 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006092 errors, &errorHandler,
6093 "unicodeescape", message,
6094 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006095 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006096 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006097 }
6098 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6099 goto onError;
6100 }
6101
6102#undef WRITE_ASCII_CHAR
6103#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006105
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006106 Py_XDECREF(errorHandler);
6107 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006108 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006109
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006111 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_XDECREF(errorHandler);
6113 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 return NULL;
6115}
6116
Eric V. Smith42454af2016-10-31 09:22:08 -04006117PyObject *
6118PyUnicode_DecodeUnicodeEscape(const char *s,
6119 Py_ssize_t size,
6120 const char *errors)
6121{
6122 const char *first_invalid_escape;
6123 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6124 &first_invalid_escape);
6125 if (result == NULL)
6126 return NULL;
6127 if (first_invalid_escape != NULL) {
6128 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6129 "invalid escape sequence '\\%c'",
6130 *first_invalid_escape) < 0) {
6131 Py_DECREF(result);
6132 return NULL;
6133 }
6134 }
6135 return result;
6136}
6137
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006138/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Alexander Belopolsky40018472011-02-26 01:02:56 +00006140PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006143 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006144 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006146 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006148 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
Ezio Melottie7f90372012-10-05 03:33:31 +03006150 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006151 escape.
6152
Ezio Melottie7f90372012-10-05 03:33:31 +03006153 For UCS1 strings it's '\xxx', 4 bytes per source character.
6154 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6155 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006156 */
6157
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 if (!PyUnicode_Check(unicode)) {
6159 PyErr_BadArgument();
6160 return NULL;
6161 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006164 }
Victor Stinner358af132015-10-12 22:36:57 +02006165
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006167 if (len == 0) {
6168 return PyBytes_FromStringAndSize(NULL, 0);
6169 }
6170
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 kind = PyUnicode_KIND(unicode);
6172 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6174 bytes, and 1 byte characters 4. */
6175 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006176 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 return PyErr_NoMemory();
6178 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006179 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006180 if (repr == NULL) {
6181 return NULL;
6182 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183
Victor Stinner62ec3312016-09-06 17:04:34 -07006184 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006186 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006187
Victor Stinner62ec3312016-09-06 17:04:34 -07006188 /* U+0000-U+00ff range */
6189 if (ch < 0x100) {
6190 if (ch >= ' ' && ch < 127) {
6191 if (ch != '\\') {
6192 /* Copy printable US ASCII as-is */
6193 *p++ = (char) ch;
6194 }
6195 /* Escape backslashes */
6196 else {
6197 *p++ = '\\';
6198 *p++ = '\\';
6199 }
6200 }
Victor Stinner358af132015-10-12 22:36:57 +02006201
Victor Stinner62ec3312016-09-06 17:04:34 -07006202 /* Map special whitespace to '\t', \n', '\r' */
6203 else if (ch == '\t') {
6204 *p++ = '\\';
6205 *p++ = 't';
6206 }
6207 else if (ch == '\n') {
6208 *p++ = '\\';
6209 *p++ = 'n';
6210 }
6211 else if (ch == '\r') {
6212 *p++ = '\\';
6213 *p++ = 'r';
6214 }
6215
6216 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6217 else {
6218 *p++ = '\\';
6219 *p++ = 'x';
6220 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6221 *p++ = Py_hexdigits[ch & 0x000F];
6222 }
Tim Petersced69f82003-09-16 20:30:58 +00006223 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006224 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006225 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 *p++ = '\\';
6227 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006228 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6229 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6230 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6231 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006233 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6234 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006235
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 /* Make sure that the first two digits are zero */
6237 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006238 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006239 *p++ = 'U';
6240 *p++ = '0';
6241 *p++ = '0';
6242 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6243 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6244 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6245 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6246 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6247 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 assert(p - PyBytes_AS_STRING(repr) > 0);
6252 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6253 return NULL;
6254 }
6255 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256}
6257
Alexander Belopolsky40018472011-02-26 01:02:56 +00006258PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006259PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6260 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006262 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006263 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 }
6267
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268 result = PyUnicode_AsUnicodeEscapeString(tmp);
6269 Py_DECREF(tmp);
6270 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271}
6272
6273/* --- Raw Unicode Escape Codec ------------------------------------------- */
6274
Alexander Belopolsky40018472011-02-26 01:02:56 +00006275PyObject *
6276PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006277 Py_ssize_t size,
6278 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006280 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006281 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 PyObject *errorHandler = NULL;
6284 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006285
Victor Stinner62ec3312016-09-06 17:04:34 -07006286 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006287 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006288 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006289
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 /* Escaped strings will always be longer than the resulting
6291 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 length after conversion to the true value. (But decoding error
6293 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006294 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 writer.min_length = size;
6296 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6297 goto onError;
6298 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 end = s + size;
6301 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 unsigned char c = (unsigned char) *s++;
6303 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006304 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 Py_ssize_t startinpos;
6306 Py_ssize_t endinpos;
6307 const char *message;
6308
6309#define WRITE_CHAR(ch) \
6310 do { \
6311 if (ch <= writer.maxchar) { \
6312 assert(writer.pos < writer.size); \
6313 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6314 } \
6315 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6316 goto onError; \
6317 } \
6318 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 if (c != '\\' || s >= end) {
6322 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006324 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006325
Victor Stinner62ec3312016-09-06 17:04:34 -07006326 c = (unsigned char) *s++;
6327 if (c == 'u') {
6328 count = 4;
6329 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 else if (c == 'U') {
6332 count = 8;
6333 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006334 }
6335 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 assert(writer.pos < writer.size);
6337 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6338 WRITE_CHAR(c);
6339 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006340 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 startinpos = s - starts - 2;
6342
6343 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6344 for (ch = 0; count && s < end; ++s, --count) {
6345 c = (unsigned char)*s;
6346 ch <<= 4;
6347 if (c >= '0' && c <= '9') {
6348 ch += c - '0';
6349 }
6350 else if (c >= 'a' && c <= 'f') {
6351 ch += c - ('a' - 10);
6352 }
6353 else if (c >= 'A' && c <= 'F') {
6354 ch += c - ('A' - 10);
6355 }
6356 else {
6357 break;
6358 }
6359 }
6360 if (!count) {
6361 if (ch <= MAX_UNICODE) {
6362 WRITE_CHAR(ch);
6363 continue;
6364 }
6365 message = "\\Uxxxxxxxx out of range";
6366 }
6367
6368 endinpos = s-starts;
6369 writer.min_length = end - s + writer.pos;
6370 if (unicode_decode_call_errorhandler_writer(
6371 errors, &errorHandler,
6372 "rawunicodeescape", message,
6373 &starts, &end, &startinpos, &endinpos, &exc, &s,
6374 &writer)) {
6375 goto onError;
6376 }
6377 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6378 goto onError;
6379 }
6380
6381#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383 Py_XDECREF(errorHandler);
6384 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006385 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006386
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006388 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006389 Py_XDECREF(errorHandler);
6390 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006392
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393}
6394
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006395
Alexander Belopolsky40018472011-02-26 01:02:56 +00006396PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006397PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398{
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006401 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006402 int kind;
6403 void *data;
6404 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 if (!PyUnicode_Check(unicode)) {
6407 PyErr_BadArgument();
6408 return NULL;
6409 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006413 kind = PyUnicode_KIND(unicode);
6414 data = PyUnicode_DATA(unicode);
6415 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 if (kind == PyUnicode_1BYTE_KIND) {
6417 return PyBytes_FromStringAndSize(data, len);
6418 }
Victor Stinner0e368262011-11-10 20:12:49 +01006419
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6421 bytes, and 1 byte characters 4. */
6422 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006423
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 if (len > PY_SSIZE_T_MAX / expandsize) {
6425 return PyErr_NoMemory();
6426 }
6427 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6428 if (repr == NULL) {
6429 return NULL;
6430 }
6431 if (len == 0) {
6432 return repr;
6433 }
6434
6435 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006436 for (pos = 0; pos < len; pos++) {
6437 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006438
Victor Stinner62ec3312016-09-06 17:04:34 -07006439 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6440 if (ch < 0x100) {
6441 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006442 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006443 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6444 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 *p++ = '\\';
6446 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006447 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6448 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6449 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6450 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6453 else {
6454 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6455 *p++ = '\\';
6456 *p++ = 'U';
6457 *p++ = '0';
6458 *p++ = '0';
6459 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6460 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6463 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6464 *p++ = Py_hexdigits[ch & 15];
6465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006467
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 assert(p > PyBytes_AS_STRING(repr));
6469 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6470 return NULL;
6471 }
6472 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473}
6474
Alexander Belopolsky40018472011-02-26 01:02:56 +00006475PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006476PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6477 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006479 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006480 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006481 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006482 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006483 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6484 Py_DECREF(tmp);
6485 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486}
6487
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006488/* --- Unicode Internal Codec ------------------------------------------- */
6489
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490PyObject *
6491_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006492 Py_ssize_t size,
6493 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006494{
6495 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006496 Py_ssize_t startinpos;
6497 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006498 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006499 const char *end;
6500 const char *reason;
6501 PyObject *errorHandler = NULL;
6502 PyObject *exc = NULL;
6503
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006504 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006505 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006506 1))
6507 return NULL;
6508
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006509 if (size == 0)
6510 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006511
Victor Stinner8f674cc2013-04-17 23:02:17 +02006512 _PyUnicodeWriter_Init(&writer);
6513 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6514 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006516 }
6517 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006518
Victor Stinner8f674cc2013-04-17 23:02:17 +02006519 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006520 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006521 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006522 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006523 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006524 endinpos = end-starts;
6525 reason = "truncated input";
6526 goto error;
6527 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006528 /* We copy the raw representation one byte at a time because the
6529 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006530 ((char *) &uch)[0] = s[0];
6531 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006532#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006533 ((char *) &uch)[2] = s[2];
6534 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006535#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006536 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006537#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006538 /* We have to sanity check the raw data, otherwise doom looms for
6539 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006540 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006541 endinpos = s - starts + Py_UNICODE_SIZE;
6542 reason = "illegal code point (> 0x10FFFF)";
6543 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006544 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006545#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006546 s += Py_UNICODE_SIZE;
6547#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006548 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006549 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 Py_UNICODE uch2;
6551 ((char *) &uch2)[0] = s[0];
6552 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006553 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006554 {
Victor Stinner551ac952011-11-29 22:58:13 +01006555 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006556 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006557 }
6558 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006559#endif
6560
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006561 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006562 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006563 continue;
6564
6565 error:
6566 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006567 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006568 errors, &errorHandler,
6569 "unicode_internal", reason,
6570 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006571 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006572 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006573 }
6574
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006575 Py_XDECREF(errorHandler);
6576 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006577 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006578
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006580 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006581 Py_XDECREF(errorHandler);
6582 Py_XDECREF(exc);
6583 return NULL;
6584}
6585
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586/* --- Latin-1 Codec ------------------------------------------------------ */
6587
Alexander Belopolsky40018472011-02-26 01:02:56 +00006588PyObject *
6589PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006590 Py_ssize_t size,
6591 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006594 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595}
6596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006598static void
6599make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006600 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006601 PyObject *unicode,
6602 Py_ssize_t startpos, Py_ssize_t endpos,
6603 const char *reason)
6604{
6605 if (*exceptionObject == NULL) {
6606 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006608 encoding, unicode, startpos, endpos, reason);
6609 }
6610 else {
6611 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6612 goto onError;
6613 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6614 goto onError;
6615 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6616 goto onError;
6617 return;
6618 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006619 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006620 }
6621}
6622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006623/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006624static void
6625raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006626 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006627 PyObject *unicode,
6628 Py_ssize_t startpos, Py_ssize_t endpos,
6629 const char *reason)
6630{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006631 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006632 encoding, unicode, startpos, endpos, reason);
6633 if (*exceptionObject != NULL)
6634 PyCodec_StrictErrors(*exceptionObject);
6635}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636
6637/* error handling callback helper:
6638 build arguments, call the callback and check the arguments,
6639 put the result into newpos and return the replacement string, which
6640 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006641static PyObject *
6642unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006643 PyObject **errorHandler,
6644 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 Py_ssize_t startpos, Py_ssize_t endpos,
6647 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006649 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 PyObject *restuple;
6652 PyObject *resunicode;
6653
6654 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 }
6659
Benjamin Petersonbac79492012-01-14 13:34:47 -05006660 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 return NULL;
6662 len = PyUnicode_GET_LENGTH(unicode);
6663
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006664 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006665 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006669 restuple = PyObject_CallFunctionObjArgs(
6670 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006674 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 Py_DECREF(restuple);
6676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006678 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 &resunicode, newpos)) {
6680 Py_DECREF(restuple);
6681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006683 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6684 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6685 Py_DECREF(restuple);
6686 return NULL;
6687 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689 *newpos = len + *newpos;
6690 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006691 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 Py_DECREF(restuple);
6693 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006694 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 Py_INCREF(resunicode);
6696 Py_DECREF(restuple);
6697 return resunicode;
6698}
6699
Alexander Belopolsky40018472011-02-26 01:02:56 +00006700static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006701unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006702 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006703 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006704{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006705 /* input state */
6706 Py_ssize_t pos=0, size;
6707 int kind;
6708 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 /* pointer into the output */
6710 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006711 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6712 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006713 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006715 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006716 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006717 /* output object */
6718 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719
Benjamin Petersonbac79492012-01-14 13:34:47 -05006720 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006721 return NULL;
6722 size = PyUnicode_GET_LENGTH(unicode);
6723 kind = PyUnicode_KIND(unicode);
6724 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 /* allocate enough for a simple encoding without
6726 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006727 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006728 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006729
6730 _PyBytesWriter_Init(&writer);
6731 str = _PyBytesWriter_Alloc(&writer, size);
6732 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006733 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006736 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006739 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006741 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006742 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006743 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006745 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006747 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006748 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006750
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006751 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006753
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006754 /* Only overallocate the buffer if it's not the last write */
6755 writer.overallocate = (collend < size);
6756
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006758 if (error_handler == _Py_ERROR_UNKNOWN)
6759 error_handler = get_error_handler(errors);
6760
6761 switch (error_handler) {
6762 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006763 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006765
6766 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006767 memset(str, '?', collend - collstart);
6768 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006769 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006770 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006771 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 break;
Victor Stinner50149202015-09-22 00:26:54 +02006773
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006774 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006775 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006776 writer.min_size -= (collend - collstart);
6777 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006778 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006779 if (str == NULL)
6780 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006781 pos = collend;
6782 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006783
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006784 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006785 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006786 writer.min_size -= (collend - collstart);
6787 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006788 unicode, collstart, collend);
6789 if (str == NULL)
6790 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006791 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 break;
Victor Stinner50149202015-09-22 00:26:54 +02006793
Victor Stinnerc3713e92015-09-29 12:32:13 +02006794 case _Py_ERROR_SURROGATEESCAPE:
6795 for (i = collstart; i < collend; ++i) {
6796 ch = PyUnicode_READ(kind, data, i);
6797 if (ch < 0xdc80 || 0xdcff < ch) {
6798 /* Not a UTF-8b surrogate */
6799 break;
6800 }
6801 *str++ = (char)(ch - 0xdc00);
6802 ++pos;
6803 }
6804 if (i >= collend)
6805 break;
6806 collstart = pos;
6807 assert(collstart != collend);
6808 /* fallback to general error handling */
6809
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006811 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6812 encoding, reason, unicode, &exc,
6813 collstart, collend, &newpos);
6814 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006816
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006817 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006818 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006819
Victor Stinner6bd525b2015-10-09 13:10:05 +02006820 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006821 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006822 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006823 PyBytes_AS_STRING(rep),
6824 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006825 if (str == NULL)
6826 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006827 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006828 else {
6829 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006830
Victor Stinner6bd525b2015-10-09 13:10:05 +02006831 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006833
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006834 if (limit == 256 ?
6835 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6836 !PyUnicode_IS_ASCII(rep))
6837 {
6838 /* Not all characters are smaller than limit */
6839 raise_encode_exception(&exc, encoding, unicode,
6840 collstart, collend, reason);
6841 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006843 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6844 str = _PyBytesWriter_WriteBytes(&writer, str,
6845 PyUnicode_DATA(rep),
6846 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006848 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006849 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006850 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006851
6852 /* If overallocation was disabled, ensure that it was the last
6853 write. Otherwise, we missed an optimization */
6854 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006855 }
6856 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006857
Victor Stinner50149202015-09-22 00:26:54 +02006858 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006860 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006861
6862 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006863 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006864 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006865 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006866 Py_XDECREF(exc);
6867 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868}
6869
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006870/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006871PyObject *
6872PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006873 Py_ssize_t size,
6874 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006876 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006877 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006878 if (unicode == NULL)
6879 return NULL;
6880 result = unicode_encode_ucs1(unicode, errors, 256);
6881 Py_DECREF(unicode);
6882 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883}
6884
Alexander Belopolsky40018472011-02-26 01:02:56 +00006885PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006886_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887{
6888 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 PyErr_BadArgument();
6890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006892 if (PyUnicode_READY(unicode) == -1)
6893 return NULL;
6894 /* Fast path: if it is a one-byte string, construct
6895 bytes object directly. */
6896 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6897 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6898 PyUnicode_GET_LENGTH(unicode));
6899 /* Non-Latin-1 characters present. Defer to above function to
6900 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006901 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006902}
6903
6904PyObject*
6905PyUnicode_AsLatin1String(PyObject *unicode)
6906{
6907 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908}
6909
6910/* --- 7-bit ASCII Codec -------------------------------------------------- */
6911
Alexander Belopolsky40018472011-02-26 01:02:56 +00006912PyObject *
6913PyUnicode_DecodeASCII(const char *s,
6914 Py_ssize_t size,
6915 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006918 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006919 int kind;
6920 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006921 Py_ssize_t startinpos;
6922 Py_ssize_t endinpos;
6923 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006924 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006925 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006926 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006927 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006928
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006930 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006931
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006933 if (size == 1 && (unsigned char)s[0] < 128)
6934 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006935
Victor Stinner8f674cc2013-04-17 23:02:17 +02006936 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006937 writer.min_length = size;
6938 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006939 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006942 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006943 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006944 writer.pos = outpos;
6945 if (writer.pos == size)
6946 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006947
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006948 s += writer.pos;
6949 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006950 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006951 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006953 PyUnicode_WRITE(kind, data, writer.pos, c);
6954 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006956 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006958
6959 /* byte outsize range 0x00..0x7f: call the error handler */
6960
6961 if (error_handler == _Py_ERROR_UNKNOWN)
6962 error_handler = get_error_handler(errors);
6963
6964 switch (error_handler)
6965 {
6966 case _Py_ERROR_REPLACE:
6967 case _Py_ERROR_SURROGATEESCAPE:
6968 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006969 but we may switch to UCS2 at the first write */
6970 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6971 goto onError;
6972 kind = writer.kind;
6973 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006974
6975 if (error_handler == _Py_ERROR_REPLACE)
6976 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6977 else
6978 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6979 writer.pos++;
6980 ++s;
6981 break;
6982
6983 case _Py_ERROR_IGNORE:
6984 ++s;
6985 break;
6986
6987 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 startinpos = s-starts;
6989 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006990 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006991 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 "ascii", "ordinal not in range(128)",
6993 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006994 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006996 kind = writer.kind;
6997 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007000 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007002 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007003
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007005 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007006 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 return NULL;
7009}
7010
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007011/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007012PyObject *
7013PyUnicode_EncodeASCII(const Py_UNICODE *p,
7014 Py_ssize_t size,
7015 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007017 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007018 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007019 if (unicode == NULL)
7020 return NULL;
7021 result = unicode_encode_ucs1(unicode, errors, 128);
7022 Py_DECREF(unicode);
7023 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024}
7025
Alexander Belopolsky40018472011-02-26 01:02:56 +00007026PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007027_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028{
7029 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 PyErr_BadArgument();
7031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007033 if (PyUnicode_READY(unicode) == -1)
7034 return NULL;
7035 /* Fast path: if it is an ASCII-only string, construct bytes object
7036 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007037 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007038 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7039 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007040 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007041}
7042
7043PyObject *
7044PyUnicode_AsASCIIString(PyObject *unicode)
7045{
7046 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047}
7048
Steve Dowercc16be82016-09-08 10:35:16 -07007049#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007050
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007051/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007052
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007053#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007054#define NEED_RETRY
7055#endif
7056
Victor Stinner3a50e702011-10-18 21:21:00 +02007057#ifndef WC_ERR_INVALID_CHARS
7058# define WC_ERR_INVALID_CHARS 0x0080
7059#endif
7060
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007061static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007062code_page_name(UINT code_page, PyObject **obj)
7063{
7064 *obj = NULL;
7065 if (code_page == CP_ACP)
7066 return "mbcs";
7067 if (code_page == CP_UTF7)
7068 return "CP_UTF7";
7069 if (code_page == CP_UTF8)
7070 return "CP_UTF8";
7071
7072 *obj = PyBytes_FromFormat("cp%u", code_page);
7073 if (*obj == NULL)
7074 return NULL;
7075 return PyBytes_AS_STRING(*obj);
7076}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007077
Victor Stinner3a50e702011-10-18 21:21:00 +02007078static DWORD
7079decode_code_page_flags(UINT code_page)
7080{
7081 if (code_page == CP_UTF7) {
7082 /* The CP_UTF7 decoder only supports flags=0 */
7083 return 0;
7084 }
7085 else
7086 return MB_ERR_INVALID_CHARS;
7087}
7088
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 * Decode a byte string from a Windows code page into unicode object in strict
7091 * mode.
7092 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007093 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7094 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007096static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007097decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007098 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 const char *in,
7100 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007101{
Victor Stinner3a50e702011-10-18 21:21:00 +02007102 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007103 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007104 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105
7106 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 assert(insize > 0);
7108 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7109 if (outsize <= 0)
7110 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111
7112 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007114 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007115 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 if (*v == NULL)
7117 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119 }
7120 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007123 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007126 }
7127
7128 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7130 if (outsize <= 0)
7131 goto error;
7132 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007133
Victor Stinner3a50e702011-10-18 21:21:00 +02007134error:
7135 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7136 return -2;
7137 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007138 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139}
7140
Victor Stinner3a50e702011-10-18 21:21:00 +02007141/*
7142 * Decode a byte string from a code page into unicode object with an error
7143 * handler.
7144 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007145 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 * UnicodeDecodeError exception and returns -1 on error.
7147 */
7148static int
7149decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007150 PyObject **v,
7151 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007152 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007153{
7154 const char *startin = in;
7155 const char *endin = in + size;
7156 const DWORD flags = decode_code_page_flags(code_page);
7157 /* Ideally, we should get reason from FormatMessage. This is the Windows
7158 2000 English version of the message. */
7159 const char *reason = "No mapping for the Unicode character exists "
7160 "in the target code page.";
7161 /* each step cannot decode more than 1 character, but a character can be
7162 represented as a surrogate pair */
7163 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007164 int insize;
7165 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 PyObject *errorHandler = NULL;
7167 PyObject *exc = NULL;
7168 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007169 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 DWORD err;
7171 int ret = -1;
7172
7173 assert(size > 0);
7174
7175 encoding = code_page_name(code_page, &encoding_obj);
7176 if (encoding == NULL)
7177 return -1;
7178
Victor Stinner7d00cc12014-03-17 23:08:06 +01007179 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7181 UnicodeDecodeError. */
7182 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7183 if (exc != NULL) {
7184 PyCodec_StrictErrors(exc);
7185 Py_CLEAR(exc);
7186 }
7187 goto error;
7188 }
7189
7190 if (*v == NULL) {
7191 /* Create unicode object */
7192 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7193 PyErr_NoMemory();
7194 goto error;
7195 }
Victor Stinnerab595942011-12-17 04:59:06 +01007196 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007197 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 if (*v == NULL)
7199 goto error;
7200 startout = PyUnicode_AS_UNICODE(*v);
7201 }
7202 else {
7203 /* Extend unicode object */
7204 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7205 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7206 PyErr_NoMemory();
7207 goto error;
7208 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007209 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 goto error;
7211 startout = PyUnicode_AS_UNICODE(*v) + n;
7212 }
7213
7214 /* Decode the byte string character per character */
7215 out = startout;
7216 while (in < endin)
7217 {
7218 /* Decode a character */
7219 insize = 1;
7220 do
7221 {
7222 outsize = MultiByteToWideChar(code_page, flags,
7223 in, insize,
7224 buffer, Py_ARRAY_LENGTH(buffer));
7225 if (outsize > 0)
7226 break;
7227 err = GetLastError();
7228 if (err != ERROR_NO_UNICODE_TRANSLATION
7229 && err != ERROR_INSUFFICIENT_BUFFER)
7230 {
7231 PyErr_SetFromWindowsErr(0);
7232 goto error;
7233 }
7234 insize++;
7235 }
7236 /* 4=maximum length of a UTF-8 sequence */
7237 while (insize <= 4 && (in + insize) <= endin);
7238
7239 if (outsize <= 0) {
7240 Py_ssize_t startinpos, endinpos, outpos;
7241
Victor Stinner7d00cc12014-03-17 23:08:06 +01007242 /* last character in partial decode? */
7243 if (in + insize >= endin && !final)
7244 break;
7245
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 startinpos = in - startin;
7247 endinpos = startinpos + 1;
7248 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007249 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 errors, &errorHandler,
7251 encoding, reason,
7252 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007253 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 {
7255 goto error;
7256 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007257 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 }
7259 else {
7260 in += insize;
7261 memcpy(out, buffer, outsize * sizeof(wchar_t));
7262 out += outsize;
7263 }
7264 }
7265
7266 /* write a NUL character at the end */
7267 *out = 0;
7268
7269 /* Extend unicode object */
7270 outsize = out - startout;
7271 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007272 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007274 /* (in - startin) <= size and size is an int */
7275 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007276
7277error:
7278 Py_XDECREF(encoding_obj);
7279 Py_XDECREF(errorHandler);
7280 Py_XDECREF(exc);
7281 return ret;
7282}
7283
Victor Stinner3a50e702011-10-18 21:21:00 +02007284static PyObject *
7285decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007286 const char *s, Py_ssize_t size,
7287 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007288{
Victor Stinner76a31a62011-11-04 00:05:13 +01007289 PyObject *v = NULL;
7290 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007291
Victor Stinner3a50e702011-10-18 21:21:00 +02007292 if (code_page < 0) {
7293 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7294 return NULL;
7295 }
7296
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007297 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007299
Victor Stinner76a31a62011-11-04 00:05:13 +01007300 do
7301 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 if (size > INT_MAX) {
7304 chunk_size = INT_MAX;
7305 final = 0;
7306 done = 0;
7307 }
7308 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007310 {
7311 chunk_size = (int)size;
7312 final = (consumed == NULL);
7313 done = 1;
7314 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315
Victor Stinner76a31a62011-11-04 00:05:13 +01007316 if (chunk_size == 0 && done) {
7317 if (v != NULL)
7318 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007319 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007320 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321
Victor Stinner76a31a62011-11-04 00:05:13 +01007322 converted = decode_code_page_strict(code_page, &v,
7323 s, chunk_size);
7324 if (converted == -2)
7325 converted = decode_code_page_errors(code_page, &v,
7326 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007327 errors, final);
7328 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007329
7330 if (converted < 0) {
7331 Py_XDECREF(v);
7332 return NULL;
7333 }
7334
7335 if (consumed)
7336 *consumed += converted;
7337
7338 s += converted;
7339 size -= converted;
7340 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007341
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007342 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343}
7344
Alexander Belopolsky40018472011-02-26 01:02:56 +00007345PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007346PyUnicode_DecodeCodePageStateful(int code_page,
7347 const char *s,
7348 Py_ssize_t size,
7349 const char *errors,
7350 Py_ssize_t *consumed)
7351{
7352 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7353}
7354
7355PyObject *
7356PyUnicode_DecodeMBCSStateful(const char *s,
7357 Py_ssize_t size,
7358 const char *errors,
7359 Py_ssize_t *consumed)
7360{
7361 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7362}
7363
7364PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007365PyUnicode_DecodeMBCS(const char *s,
7366 Py_ssize_t size,
7367 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007368{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7370}
7371
Victor Stinner3a50e702011-10-18 21:21:00 +02007372static DWORD
7373encode_code_page_flags(UINT code_page, const char *errors)
7374{
7375 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007376 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 }
7378 else if (code_page == CP_UTF7) {
7379 /* CP_UTF7 only supports flags=0 */
7380 return 0;
7381 }
7382 else {
7383 if (errors != NULL && strcmp(errors, "replace") == 0)
7384 return 0;
7385 else
7386 return WC_NO_BEST_FIT_CHARS;
7387 }
7388}
7389
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 * Encode a Unicode string to a Windows code page into a byte string in strict
7392 * mode.
7393 *
7394 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007395 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007396 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007397static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007398encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007399 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007401{
Victor Stinner554f3f02010-06-16 23:33:54 +00007402 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 BOOL *pusedDefaultChar = &usedDefaultChar;
7404 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007405 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007406 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 const DWORD flags = encode_code_page_flags(code_page, NULL);
7408 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007409 /* Create a substring so that we can get the UTF-16 representation
7410 of just the slice under consideration. */
7411 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007412
Martin v. Löwis3d325192011-11-04 18:23:06 +01007413 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007414
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007416 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007418 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007419
Victor Stinner2fc507f2011-11-04 20:06:39 +01007420 substring = PyUnicode_Substring(unicode, offset, offset+len);
7421 if (substring == NULL)
7422 return -1;
7423 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7424 if (p == NULL) {
7425 Py_DECREF(substring);
7426 return -1;
7427 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007428 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007430 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007432 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 NULL, 0,
7434 NULL, pusedDefaultChar);
7435 if (outsize <= 0)
7436 goto error;
7437 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007438 if (pusedDefaultChar && *pusedDefaultChar) {
7439 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007441 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007442
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007446 if (*outbytes == NULL) {
7447 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007449 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007451 }
7452 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 const Py_ssize_t n = PyBytes_Size(*outbytes);
7455 if (outsize > PY_SSIZE_T_MAX - n) {
7456 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007460 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7461 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007465 }
7466
7467 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007469 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 out, outsize,
7471 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 if (outsize <= 0)
7474 goto error;
7475 if (pusedDefaultChar && *pusedDefaultChar)
7476 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007478
Victor Stinner3a50e702011-10-18 21:21:00 +02007479error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007480 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7482 return -2;
7483 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007484 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007485}
7486
Victor Stinner3a50e702011-10-18 21:21:00 +02007487/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007488 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 * error handler.
7490 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007491 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 * -1 on other error.
7493 */
7494static int
7495encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007496 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007497 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007498{
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007500 Py_ssize_t pos = unicode_offset;
7501 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 /* Ideally, we should get reason from FormatMessage. This is the Windows
7503 2000 English version of the message. */
7504 const char *reason = "invalid character";
7505 /* 4=maximum length of a UTF-8 sequence */
7506 char buffer[4];
7507 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7508 Py_ssize_t outsize;
7509 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 PyObject *errorHandler = NULL;
7511 PyObject *exc = NULL;
7512 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007513 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007514 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 PyObject *rep;
7516 int ret = -1;
7517
7518 assert(insize > 0);
7519
7520 encoding = code_page_name(code_page, &encoding_obj);
7521 if (encoding == NULL)
7522 return -1;
7523
7524 if (errors == NULL || strcmp(errors, "strict") == 0) {
7525 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7526 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007527 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 if (exc != NULL) {
7529 PyCodec_StrictErrors(exc);
7530 Py_DECREF(exc);
7531 }
7532 Py_XDECREF(encoding_obj);
7533 return -1;
7534 }
7535
7536 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7537 pusedDefaultChar = &usedDefaultChar;
7538 else
7539 pusedDefaultChar = NULL;
7540
7541 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7542 PyErr_NoMemory();
7543 goto error;
7544 }
7545 outsize = insize * Py_ARRAY_LENGTH(buffer);
7546
7547 if (*outbytes == NULL) {
7548 /* Create string object */
7549 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7550 if (*outbytes == NULL)
7551 goto error;
7552 out = PyBytes_AS_STRING(*outbytes);
7553 }
7554 else {
7555 /* Extend string object */
7556 Py_ssize_t n = PyBytes_Size(*outbytes);
7557 if (n > PY_SSIZE_T_MAX - outsize) {
7558 PyErr_NoMemory();
7559 goto error;
7560 }
7561 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7562 goto error;
7563 out = PyBytes_AS_STRING(*outbytes) + n;
7564 }
7565
7566 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007567 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007569 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7570 wchar_t chars[2];
7571 int charsize;
7572 if (ch < 0x10000) {
7573 chars[0] = (wchar_t)ch;
7574 charsize = 1;
7575 }
7576 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007577 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7578 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007579 charsize = 2;
7580 }
7581
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007583 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 buffer, Py_ARRAY_LENGTH(buffer),
7585 NULL, pusedDefaultChar);
7586 if (outsize > 0) {
7587 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7588 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007589 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 memcpy(out, buffer, outsize);
7591 out += outsize;
7592 continue;
7593 }
7594 }
7595 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7596 PyErr_SetFromWindowsErr(0);
7597 goto error;
7598 }
7599
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 rep = unicode_encode_call_errorhandler(
7601 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007602 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007603 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 if (rep == NULL)
7605 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007607
7608 if (PyBytes_Check(rep)) {
7609 outsize = PyBytes_GET_SIZE(rep);
7610 if (outsize != 1) {
7611 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7612 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7613 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7614 Py_DECREF(rep);
7615 goto error;
7616 }
7617 out = PyBytes_AS_STRING(*outbytes) + offset;
7618 }
7619 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7620 out += outsize;
7621 }
7622 else {
7623 Py_ssize_t i;
7624 enum PyUnicode_Kind kind;
7625 void *data;
7626
Benjamin Petersonbac79492012-01-14 13:34:47 -05007627 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 Py_DECREF(rep);
7629 goto error;
7630 }
7631
7632 outsize = PyUnicode_GET_LENGTH(rep);
7633 if (outsize != 1) {
7634 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7635 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7636 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7637 Py_DECREF(rep);
7638 goto error;
7639 }
7640 out = PyBytes_AS_STRING(*outbytes) + offset;
7641 }
7642 kind = PyUnicode_KIND(rep);
7643 data = PyUnicode_DATA(rep);
7644 for (i=0; i < outsize; i++) {
7645 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7646 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007647 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007648 encoding, unicode,
7649 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 "unable to encode error handler result to ASCII");
7651 Py_DECREF(rep);
7652 goto error;
7653 }
7654 *out = (unsigned char)ch;
7655 out++;
7656 }
7657 }
7658 Py_DECREF(rep);
7659 }
7660 /* write a NUL byte */
7661 *out = 0;
7662 outsize = out - PyBytes_AS_STRING(*outbytes);
7663 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7664 if (_PyBytes_Resize(outbytes, outsize) < 0)
7665 goto error;
7666 ret = 0;
7667
7668error:
7669 Py_XDECREF(encoding_obj);
7670 Py_XDECREF(errorHandler);
7671 Py_XDECREF(exc);
7672 return ret;
7673}
7674
Victor Stinner3a50e702011-10-18 21:21:00 +02007675static PyObject *
7676encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007677 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007678 const char *errors)
7679{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007680 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007681 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007682 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007683 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007684
Victor Stinner29dacf22015-01-26 16:41:32 +01007685 if (!PyUnicode_Check(unicode)) {
7686 PyErr_BadArgument();
7687 return NULL;
7688 }
7689
Benjamin Petersonbac79492012-01-14 13:34:47 -05007690 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007691 return NULL;
7692 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007693
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 if (code_page < 0) {
7695 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7696 return NULL;
7697 }
7698
Martin v. Löwis3d325192011-11-04 18:23:06 +01007699 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 return PyBytes_FromStringAndSize(NULL, 0);
7701
Victor Stinner7581cef2011-11-03 22:32:33 +01007702 offset = 0;
7703 do
7704 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007705#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007706 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007707 chunks. */
7708 if (len > INT_MAX/2) {
7709 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007710 done = 0;
7711 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007712 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007713#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007714 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007715 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007716 done = 1;
7717 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007718
Victor Stinner76a31a62011-11-04 00:05:13 +01007719 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007720 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007721 errors);
7722 if (ret == -2)
7723 ret = encode_code_page_errors(code_page, &outbytes,
7724 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007726 if (ret < 0) {
7727 Py_XDECREF(outbytes);
7728 return NULL;
7729 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007730
Victor Stinner7581cef2011-11-03 22:32:33 +01007731 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007733 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007734
Victor Stinner3a50e702011-10-18 21:21:00 +02007735 return outbytes;
7736}
7737
7738PyObject *
7739PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7740 Py_ssize_t size,
7741 const char *errors)
7742{
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007744 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007745 if (unicode == NULL)
7746 return NULL;
7747 res = encode_code_page(CP_ACP, unicode, errors);
7748 Py_DECREF(unicode);
7749 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007750}
7751
7752PyObject *
7753PyUnicode_EncodeCodePage(int code_page,
7754 PyObject *unicode,
7755 const char *errors)
7756{
Victor Stinner7581cef2011-11-03 22:32:33 +01007757 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007758}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007759
Alexander Belopolsky40018472011-02-26 01:02:56 +00007760PyObject *
7761PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007762{
Victor Stinner7581cef2011-11-03 22:32:33 +01007763 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007764}
7765
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007766#undef NEED_RETRY
7767
Steve Dowercc16be82016-09-08 10:35:16 -07007768#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007769
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770/* --- Character Mapping Codec -------------------------------------------- */
7771
Victor Stinnerfb161b12013-04-18 01:44:27 +02007772static int
7773charmap_decode_string(const char *s,
7774 Py_ssize_t size,
7775 PyObject *mapping,
7776 const char *errors,
7777 _PyUnicodeWriter *writer)
7778{
7779 const char *starts = s;
7780 const char *e;
7781 Py_ssize_t startinpos, endinpos;
7782 PyObject *errorHandler = NULL, *exc = NULL;
7783 Py_ssize_t maplen;
7784 enum PyUnicode_Kind mapkind;
7785 void *mapdata;
7786 Py_UCS4 x;
7787 unsigned char ch;
7788
7789 if (PyUnicode_READY(mapping) == -1)
7790 return -1;
7791
7792 maplen = PyUnicode_GET_LENGTH(mapping);
7793 mapdata = PyUnicode_DATA(mapping);
7794 mapkind = PyUnicode_KIND(mapping);
7795
7796 e = s + size;
7797
7798 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7799 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7800 * is disabled in encoding aliases, latin1 is preferred because
7801 * its implementation is faster. */
7802 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7803 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7804 Py_UCS4 maxchar = writer->maxchar;
7805
7806 assert (writer->kind == PyUnicode_1BYTE_KIND);
7807 while (s < e) {
7808 ch = *s;
7809 x = mapdata_ucs1[ch];
7810 if (x > maxchar) {
7811 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7812 goto onError;
7813 maxchar = writer->maxchar;
7814 outdata = (Py_UCS1 *)writer->data;
7815 }
7816 outdata[writer->pos] = x;
7817 writer->pos++;
7818 ++s;
7819 }
7820 return 0;
7821 }
7822
7823 while (s < e) {
7824 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7825 enum PyUnicode_Kind outkind = writer->kind;
7826 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7827 if (outkind == PyUnicode_1BYTE_KIND) {
7828 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7829 Py_UCS4 maxchar = writer->maxchar;
7830 while (s < e) {
7831 ch = *s;
7832 x = mapdata_ucs2[ch];
7833 if (x > maxchar)
7834 goto Error;
7835 outdata[writer->pos] = x;
7836 writer->pos++;
7837 ++s;
7838 }
7839 break;
7840 }
7841 else if (outkind == PyUnicode_2BYTE_KIND) {
7842 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7843 while (s < e) {
7844 ch = *s;
7845 x = mapdata_ucs2[ch];
7846 if (x == 0xFFFE)
7847 goto Error;
7848 outdata[writer->pos] = x;
7849 writer->pos++;
7850 ++s;
7851 }
7852 break;
7853 }
7854 }
7855 ch = *s;
7856
7857 if (ch < maplen)
7858 x = PyUnicode_READ(mapkind, mapdata, ch);
7859 else
7860 x = 0xfffe; /* invalid value */
7861Error:
7862 if (x == 0xfffe)
7863 {
7864 /* undefined mapping */
7865 startinpos = s-starts;
7866 endinpos = startinpos+1;
7867 if (unicode_decode_call_errorhandler_writer(
7868 errors, &errorHandler,
7869 "charmap", "character maps to <undefined>",
7870 &starts, &e, &startinpos, &endinpos, &exc, &s,
7871 writer)) {
7872 goto onError;
7873 }
7874 continue;
7875 }
7876
7877 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7878 goto onError;
7879 ++s;
7880 }
7881 Py_XDECREF(errorHandler);
7882 Py_XDECREF(exc);
7883 return 0;
7884
7885onError:
7886 Py_XDECREF(errorHandler);
7887 Py_XDECREF(exc);
7888 return -1;
7889}
7890
7891static int
7892charmap_decode_mapping(const char *s,
7893 Py_ssize_t size,
7894 PyObject *mapping,
7895 const char *errors,
7896 _PyUnicodeWriter *writer)
7897{
7898 const char *starts = s;
7899 const char *e;
7900 Py_ssize_t startinpos, endinpos;
7901 PyObject *errorHandler = NULL, *exc = NULL;
7902 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007903 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007904
7905 e = s + size;
7906
7907 while (s < e) {
7908 ch = *s;
7909
7910 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7911 key = PyLong_FromLong((long)ch);
7912 if (key == NULL)
7913 goto onError;
7914
7915 item = PyObject_GetItem(mapping, key);
7916 Py_DECREF(key);
7917 if (item == NULL) {
7918 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7919 /* No mapping found means: mapping is undefined. */
7920 PyErr_Clear();
7921 goto Undefined;
7922 } else
7923 goto onError;
7924 }
7925
7926 /* Apply mapping */
7927 if (item == Py_None)
7928 goto Undefined;
7929 if (PyLong_Check(item)) {
7930 long value = PyLong_AS_LONG(item);
7931 if (value == 0xFFFE)
7932 goto Undefined;
7933 if (value < 0 || value > MAX_UNICODE) {
7934 PyErr_Format(PyExc_TypeError,
7935 "character mapping must be in range(0x%lx)",
7936 (unsigned long)MAX_UNICODE + 1);
7937 goto onError;
7938 }
7939
7940 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7941 goto onError;
7942 }
7943 else if (PyUnicode_Check(item)) {
7944 if (PyUnicode_READY(item) == -1)
7945 goto onError;
7946 if (PyUnicode_GET_LENGTH(item) == 1) {
7947 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7948 if (value == 0xFFFE)
7949 goto Undefined;
7950 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7951 goto onError;
7952 }
7953 else {
7954 writer->overallocate = 1;
7955 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7956 goto onError;
7957 }
7958 }
7959 else {
7960 /* wrong return value */
7961 PyErr_SetString(PyExc_TypeError,
7962 "character mapping must return integer, None or str");
7963 goto onError;
7964 }
7965 Py_CLEAR(item);
7966 ++s;
7967 continue;
7968
7969Undefined:
7970 /* undefined mapping */
7971 Py_CLEAR(item);
7972 startinpos = s-starts;
7973 endinpos = startinpos+1;
7974 if (unicode_decode_call_errorhandler_writer(
7975 errors, &errorHandler,
7976 "charmap", "character maps to <undefined>",
7977 &starts, &e, &startinpos, &endinpos, &exc, &s,
7978 writer)) {
7979 goto onError;
7980 }
7981 }
7982 Py_XDECREF(errorHandler);
7983 Py_XDECREF(exc);
7984 return 0;
7985
7986onError:
7987 Py_XDECREF(item);
7988 Py_XDECREF(errorHandler);
7989 Py_XDECREF(exc);
7990 return -1;
7991}
7992
Alexander Belopolsky40018472011-02-26 01:02:56 +00007993PyObject *
7994PyUnicode_DecodeCharmap(const char *s,
7995 Py_ssize_t size,
7996 PyObject *mapping,
7997 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007999 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008000
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 /* Default to Latin-1 */
8002 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008006 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008007 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008008 writer.min_length = size;
8009 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008011
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008012 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008013 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8014 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008015 }
8016 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008017 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8018 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008020 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008021
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008023 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 return NULL;
8025}
8026
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008027/* Charmap encoding: the lookup table */
8028
Alexander Belopolsky40018472011-02-26 01:02:56 +00008029struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 PyObject_HEAD
8031 unsigned char level1[32];
8032 int count2, count3;
8033 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008034};
8035
8036static PyObject*
8037encoding_map_size(PyObject *obj, PyObject* args)
8038{
8039 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008040 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008042}
8043
8044static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008045 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 PyDoc_STR("Return the size (in bytes) of this object") },
8047 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048};
8049
8050static void
8051encoding_map_dealloc(PyObject* o)
8052{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008053 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008054}
8055
8056static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 "EncodingMap", /*tp_name*/
8059 sizeof(struct encoding_map), /*tp_basicsize*/
8060 0, /*tp_itemsize*/
8061 /* methods */
8062 encoding_map_dealloc, /*tp_dealloc*/
8063 0, /*tp_print*/
8064 0, /*tp_getattr*/
8065 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008066 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 0, /*tp_repr*/
8068 0, /*tp_as_number*/
8069 0, /*tp_as_sequence*/
8070 0, /*tp_as_mapping*/
8071 0, /*tp_hash*/
8072 0, /*tp_call*/
8073 0, /*tp_str*/
8074 0, /*tp_getattro*/
8075 0, /*tp_setattro*/
8076 0, /*tp_as_buffer*/
8077 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8078 0, /*tp_doc*/
8079 0, /*tp_traverse*/
8080 0, /*tp_clear*/
8081 0, /*tp_richcompare*/
8082 0, /*tp_weaklistoffset*/
8083 0, /*tp_iter*/
8084 0, /*tp_iternext*/
8085 encoding_map_methods, /*tp_methods*/
8086 0, /*tp_members*/
8087 0, /*tp_getset*/
8088 0, /*tp_base*/
8089 0, /*tp_dict*/
8090 0, /*tp_descr_get*/
8091 0, /*tp_descr_set*/
8092 0, /*tp_dictoffset*/
8093 0, /*tp_init*/
8094 0, /*tp_alloc*/
8095 0, /*tp_new*/
8096 0, /*tp_free*/
8097 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098};
8099
8100PyObject*
8101PyUnicode_BuildEncodingMap(PyObject* string)
8102{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008103 PyObject *result;
8104 struct encoding_map *mresult;
8105 int i;
8106 int need_dict = 0;
8107 unsigned char level1[32];
8108 unsigned char level2[512];
8109 unsigned char *mlevel1, *mlevel2, *mlevel3;
8110 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 int kind;
8112 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008113 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008116 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117 PyErr_BadArgument();
8118 return NULL;
8119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120 kind = PyUnicode_KIND(string);
8121 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008122 length = PyUnicode_GET_LENGTH(string);
8123 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 memset(level1, 0xFF, sizeof level1);
8125 memset(level2, 0xFF, sizeof level2);
8126
8127 /* If there isn't a one-to-one mapping of NULL to \0,
8128 or if there are non-BMP characters, we need to use
8129 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008132 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 ch = PyUnicode_READ(kind, data, i);
8135 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136 need_dict = 1;
8137 break;
8138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008139 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008140 /* unmapped character */
8141 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008142 l1 = ch >> 11;
8143 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144 if (level1[l1] == 0xFF)
8145 level1[l1] = count2++;
8146 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008147 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 }
8149
8150 if (count2 >= 0xFF || count3 >= 0xFF)
8151 need_dict = 1;
8152
8153 if (need_dict) {
8154 PyObject *result = PyDict_New();
8155 PyObject *key, *value;
8156 if (!result)
8157 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008158 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008160 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 if (!key || !value)
8162 goto failed1;
8163 if (PyDict_SetItem(result, key, value) == -1)
8164 goto failed1;
8165 Py_DECREF(key);
8166 Py_DECREF(value);
8167 }
8168 return result;
8169 failed1:
8170 Py_XDECREF(key);
8171 Py_XDECREF(value);
8172 Py_DECREF(result);
8173 return NULL;
8174 }
8175
8176 /* Create a three-level trie */
8177 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8178 16*count2 + 128*count3 - 1);
8179 if (!result)
8180 return PyErr_NoMemory();
8181 PyObject_Init(result, &EncodingMapType);
8182 mresult = (struct encoding_map*)result;
8183 mresult->count2 = count2;
8184 mresult->count3 = count3;
8185 mlevel1 = mresult->level1;
8186 mlevel2 = mresult->level23;
8187 mlevel3 = mresult->level23 + 16*count2;
8188 memcpy(mlevel1, level1, 32);
8189 memset(mlevel2, 0xFF, 16*count2);
8190 memset(mlevel3, 0, 128*count3);
8191 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008192 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008194 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8195 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008196 /* unmapped character */
8197 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008198 o1 = ch>>11;
8199 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 i2 = 16*mlevel1[o1] + o2;
8201 if (mlevel2[i2] == 0xFF)
8202 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008203 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 i3 = 128*mlevel2[i2] + o3;
8205 mlevel3[i3] = i;
8206 }
8207 return result;
8208}
8209
8210static int
Victor Stinner22168992011-11-20 17:09:18 +01008211encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008212{
8213 struct encoding_map *map = (struct encoding_map*)mapping;
8214 int l1 = c>>11;
8215 int l2 = (c>>7) & 0xF;
8216 int l3 = c & 0x7F;
8217 int i;
8218
Victor Stinner22168992011-11-20 17:09:18 +01008219 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 if (c == 0)
8222 return 0;
8223 /* level 1*/
8224 i = map->level1[l1];
8225 if (i == 0xFF) {
8226 return -1;
8227 }
8228 /* level 2*/
8229 i = map->level23[16*i+l2];
8230 if (i == 0xFF) {
8231 return -1;
8232 }
8233 /* level 3 */
8234 i = map->level23[16*map->count2 + 128*i + l3];
8235 if (i == 0) {
8236 return -1;
8237 }
8238 return i;
8239}
8240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241/* Lookup the character ch in the mapping. If the character
8242 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008243 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008244static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008245charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246{
Christian Heimes217cfd12007-12-02 14:31:20 +00008247 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008248 PyObject *x;
8249
8250 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 x = PyObject_GetItem(mapping, w);
8253 Py_DECREF(w);
8254 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8256 /* No mapping found means: mapping is undefined. */
8257 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008258 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 } else
8260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008262 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008264 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 long value = PyLong_AS_LONG(x);
8266 if (value < 0 || value > 255) {
8267 PyErr_SetString(PyExc_TypeError,
8268 "character mapping must be in range(256)");
8269 Py_DECREF(x);
8270 return NULL;
8271 }
8272 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008274 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 /* wrong return value */
8278 PyErr_Format(PyExc_TypeError,
8279 "character mapping must return integer, bytes or None, not %.400s",
8280 x->ob_type->tp_name);
8281 Py_DECREF(x);
8282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 }
8284}
8285
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008286static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008287charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008288{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008289 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8290 /* exponentially overallocate to minimize reallocations */
8291 if (requiredsize < 2*outsize)
8292 requiredsize = 2*outsize;
8293 if (_PyBytes_Resize(outobj, requiredsize))
8294 return -1;
8295 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008296}
8297
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008302 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303 space is available. Return a new reference to the object that
8304 was put in the output buffer, or Py_None, if the mapping was undefined
8305 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008306 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008307static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008308charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008309 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008311 PyObject *rep;
8312 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008313 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314
Christian Heimes90aa7642007-12-19 02:45:37 +00008315 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008316 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318 if (res == -1)
8319 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 if (outsize<requiredsize)
8321 if (charmapencode_resize(outobj, outpos, requiredsize))
8322 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008323 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 outstart[(*outpos)++] = (char)res;
8325 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008326 }
8327
8328 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008331 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 Py_DECREF(rep);
8333 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 if (PyLong_Check(rep)) {
8336 Py_ssize_t requiredsize = *outpos+1;
8337 if (outsize<requiredsize)
8338 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8339 Py_DECREF(rep);
8340 return enc_EXCEPTION;
8341 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008342 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008344 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 else {
8346 const char *repchars = PyBytes_AS_STRING(rep);
8347 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8348 Py_ssize_t requiredsize = *outpos+repsize;
8349 if (outsize<requiredsize)
8350 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8351 Py_DECREF(rep);
8352 return enc_EXCEPTION;
8353 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008354 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 memcpy(outstart + *outpos, repchars, repsize);
8356 *outpos += repsize;
8357 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008359 Py_DECREF(rep);
8360 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361}
8362
8363/* handle an error in PyUnicode_EncodeCharmap
8364 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008365static int
8366charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008367 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008369 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008370 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371{
8372 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008373 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008374 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008375 enum PyUnicode_Kind kind;
8376 void *data;
8377 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008379 Py_ssize_t collstartpos = *inpos;
8380 Py_ssize_t collendpos = *inpos+1;
8381 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 char *encoding = "charmap";
8383 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008384 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008385 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008386 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387
Benjamin Petersonbac79492012-01-14 13:34:47 -05008388 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 return -1;
8390 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 /* find all unencodable characters */
8392 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008393 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008394 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008395 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008396 val = encoding_map_lookup(ch, mapping);
8397 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 break;
8399 ++collendpos;
8400 continue;
8401 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008402
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008403 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8404 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 if (rep==NULL)
8406 return -1;
8407 else if (rep!=Py_None) {
8408 Py_DECREF(rep);
8409 break;
8410 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008411 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 }
8414 /* cache callback name lookup
8415 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008416 if (*error_handler == _Py_ERROR_UNKNOWN)
8417 *error_handler = get_error_handler(errors);
8418
8419 switch (*error_handler) {
8420 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008421 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008422 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008423
8424 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008425 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 x = charmapencode_output('?', mapping, res, respos);
8427 if (x==enc_EXCEPTION) {
8428 return -1;
8429 }
8430 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008431 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 return -1;
8433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008434 }
8435 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008436 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008437 *inpos = collendpos;
8438 break;
Victor Stinner50149202015-09-22 00:26:54 +02008439
8440 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 /* generate replacement (temporarily (mis)uses p) */
8442 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 char buffer[2+29+1+1];
8444 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008445 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 for (cp = buffer; *cp; ++cp) {
8447 x = charmapencode_output(*cp, mapping, res, respos);
8448 if (x==enc_EXCEPTION)
8449 return -1;
8450 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008451 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 return -1;
8453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 }
8455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008456 *inpos = collendpos;
8457 break;
Victor Stinner50149202015-09-22 00:26:54 +02008458
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459 default:
Victor Stinner50149202015-09-22 00:26:54 +02008460 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008461 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008465 if (PyBytes_Check(repunicode)) {
8466 /* Directly copy bytes result to output. */
8467 Py_ssize_t outsize = PyBytes_Size(*res);
8468 Py_ssize_t requiredsize;
8469 repsize = PyBytes_Size(repunicode);
8470 requiredsize = *respos + repsize;
8471 if (requiredsize > outsize)
8472 /* Make room for all additional bytes. */
8473 if (charmapencode_resize(res, respos, requiredsize)) {
8474 Py_DECREF(repunicode);
8475 return -1;
8476 }
8477 memcpy(PyBytes_AsString(*res) + *respos,
8478 PyBytes_AsString(repunicode), repsize);
8479 *respos += repsize;
8480 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008481 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008482 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008483 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008484 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008485 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008486 Py_DECREF(repunicode);
8487 return -1;
8488 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008489 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008490 data = PyUnicode_DATA(repunicode);
8491 kind = PyUnicode_KIND(repunicode);
8492 for (index = 0; index < repsize; index++) {
8493 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8494 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008496 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 return -1;
8498 }
8499 else if (x==enc_FAILED) {
8500 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008501 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 return -1;
8503 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008504 }
8505 *inpos = newpos;
8506 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 }
8508 return 0;
8509}
8510
Alexander Belopolsky40018472011-02-26 01:02:56 +00008511PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008512_PyUnicode_EncodeCharmap(PyObject *unicode,
8513 PyObject *mapping,
8514 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516 /* output object */
8517 PyObject *res = NULL;
8518 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008519 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008520 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008522 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008523 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008525 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008526 void *data;
8527 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528
Benjamin Petersonbac79492012-01-14 13:34:47 -05008529 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008530 return NULL;
8531 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008532 data = PyUnicode_DATA(unicode);
8533 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008534
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 /* Default to Latin-1 */
8536 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008537 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 /* allocate enough for a simple encoding without
8540 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008541 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 if (res == NULL)
8543 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008544 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008548 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008550 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 if (x==enc_EXCEPTION) /* error */
8552 goto onError;
8553 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008556 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 &res, &respos)) {
8558 goto onError;
8559 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008560 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 else
8562 /* done with this character => adjust input position */
8563 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008567 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008568 if (_PyBytes_Resize(&res, respos) < 0)
8569 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008570
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008572 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 return res;
8574
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 Py_XDECREF(res);
8577 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008578 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 return NULL;
8580}
8581
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008582/* Deprecated */
8583PyObject *
8584PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8585 Py_ssize_t size,
8586 PyObject *mapping,
8587 const char *errors)
8588{
8589 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008590 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008591 if (unicode == NULL)
8592 return NULL;
8593 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8594 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008595 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008596}
8597
Alexander Belopolsky40018472011-02-26 01:02:56 +00008598PyObject *
8599PyUnicode_AsCharmapString(PyObject *unicode,
8600 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601{
8602 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 PyErr_BadArgument();
8604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008606 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607}
8608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008610static void
8611make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008613 Py_ssize_t startpos, Py_ssize_t endpos,
8614 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 *exceptionObject = _PyUnicodeTranslateError_Create(
8618 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 }
8620 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8622 goto onError;
8623 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8624 goto onError;
8625 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8626 goto onError;
8627 return;
8628 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008629 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 }
8631}
8632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633/* error handling callback helper:
8634 build arguments, call the callback and check the arguments,
8635 put the result into newpos and return the replacement string, which
8636 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008637static PyObject *
8638unicode_translate_call_errorhandler(const char *errors,
8639 PyObject **errorHandler,
8640 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008642 Py_ssize_t startpos, Py_ssize_t endpos,
8643 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008645 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008647 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648 PyObject *restuple;
8649 PyObject *resunicode;
8650
8651 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655 }
8656
8657 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008662 restuple = PyObject_CallFunctionObjArgs(
8663 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008667 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 Py_DECREF(restuple);
8669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008671 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 &resunicode, &i_newpos)) {
8673 Py_DECREF(restuple);
8674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008676 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008678 else
8679 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008681 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 Py_DECREF(restuple);
8683 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008684 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 Py_INCREF(resunicode);
8686 Py_DECREF(restuple);
8687 return resunicode;
8688}
8689
8690/* Lookup the character ch in the mapping and put the result in result,
8691 which must be decrefed by the caller.
8692 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008693static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695{
Christian Heimes217cfd12007-12-02 14:31:20 +00008696 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 PyObject *x;
8698
8699 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 x = PyObject_GetItem(mapping, w);
8702 Py_DECREF(w);
8703 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8705 /* No mapping found means: use 1:1 mapping. */
8706 PyErr_Clear();
8707 *result = NULL;
8708 return 0;
8709 } else
8710 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 }
8712 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 *result = x;
8714 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008716 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008718 if (value < 0 || value > MAX_UNICODE) {
8719 PyErr_Format(PyExc_ValueError,
8720 "character mapping must be in range(0x%x)",
8721 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 Py_DECREF(x);
8723 return -1;
8724 }
8725 *result = x;
8726 return 0;
8727 }
8728 else if (PyUnicode_Check(x)) {
8729 *result = x;
8730 return 0;
8731 }
8732 else {
8733 /* wrong return value */
8734 PyErr_SetString(PyExc_TypeError,
8735 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008736 Py_DECREF(x);
8737 return -1;
8738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008739}
Victor Stinner1194ea02014-04-04 19:37:40 +02008740
8741/* lookup the character, write the result into the writer.
8742 Return 1 if the result was written into the writer, return 0 if the mapping
8743 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008744static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008745charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8746 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008747{
Victor Stinner1194ea02014-04-04 19:37:40 +02008748 PyObject *item;
8749
8750 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008752
8753 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008755 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008758 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008760
8761 if (item == Py_None) {
8762 Py_DECREF(item);
8763 return 0;
8764 }
8765
8766 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008767 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8768 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8769 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008770 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8771 Py_DECREF(item);
8772 return -1;
8773 }
8774 Py_DECREF(item);
8775 return 1;
8776 }
8777
8778 if (!PyUnicode_Check(item)) {
8779 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008781 }
8782
8783 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8784 Py_DECREF(item);
8785 return -1;
8786 }
8787
8788 Py_DECREF(item);
8789 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790}
8791
Victor Stinner89a76ab2014-04-05 11:44:04 +02008792static int
8793unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8794 Py_UCS1 *translate)
8795{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008796 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008797 int ret = 0;
8798
Victor Stinner89a76ab2014-04-05 11:44:04 +02008799 if (charmaptranslate_lookup(ch, mapping, &item)) {
8800 return -1;
8801 }
8802
8803 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008804 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008805 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008806 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008807 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008808 /* not found => default to 1:1 mapping */
8809 translate[ch] = ch;
8810 return 1;
8811 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008812 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008813 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008814 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8815 used it */
8816 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008817 /* invalid character or character outside ASCII:
8818 skip the fast translate */
8819 goto exit;
8820 }
8821 translate[ch] = (Py_UCS1)replace;
8822 }
8823 else if (PyUnicode_Check(item)) {
8824 Py_UCS4 replace;
8825
8826 if (PyUnicode_READY(item) == -1) {
8827 Py_DECREF(item);
8828 return -1;
8829 }
8830 if (PyUnicode_GET_LENGTH(item) != 1)
8831 goto exit;
8832
8833 replace = PyUnicode_READ_CHAR(item, 0);
8834 if (replace > 127)
8835 goto exit;
8836 translate[ch] = (Py_UCS1)replace;
8837 }
8838 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008839 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008840 goto exit;
8841 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008842 ret = 1;
8843
Benjamin Peterson1365de72014-04-07 20:15:41 -04008844 exit:
8845 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008846 return ret;
8847}
8848
8849/* Fast path for ascii => ascii translation. Return 1 if the whole string
8850 was translated into writer, return 0 if the input string was partially
8851 translated into writer, raise an exception and return -1 on error. */
8852static int
8853unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008854 _PyUnicodeWriter *writer, int ignore,
8855 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856{
Victor Stinner872b2912014-04-05 14:27:07 +02008857 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 Py_ssize_t len;
8859 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008860 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 len = PyUnicode_GET_LENGTH(input);
8863
Victor Stinner872b2912014-04-05 14:27:07 +02008864 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865
8866 in = PyUnicode_1BYTE_DATA(input);
8867 end = in + len;
8868
8869 assert(PyUnicode_IS_ASCII(writer->buffer));
8870 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8871 out = PyUnicode_1BYTE_DATA(writer->buffer);
8872
Victor Stinner872b2912014-04-05 14:27:07 +02008873 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008875 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008876 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008877 int translate = unicode_fast_translate_lookup(mapping, ch,
8878 ascii_table);
8879 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008880 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008881 if (translate == 0)
8882 goto exit;
8883 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884 }
Victor Stinner872b2912014-04-05 14:27:07 +02008885 if (ch2 == 0xfe) {
8886 if (ignore)
8887 continue;
8888 goto exit;
8889 }
8890 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008892 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 }
Victor Stinner872b2912014-04-05 14:27:07 +02008894 res = 1;
8895
8896exit:
8897 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008898 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008899 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900}
8901
Victor Stinner3222da22015-10-01 22:07:32 +02008902static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903_PyUnicode_TranslateCharmap(PyObject *input,
8904 PyObject *mapping,
8905 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008908 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 Py_ssize_t size, i;
8910 int kind;
8911 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008912 _PyUnicodeWriter writer;
8913 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008914 char *reason = "character maps to <undefined>";
8915 PyObject *errorHandler = NULL;
8916 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008917 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008918 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008919
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 PyErr_BadArgument();
8922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 if (PyUnicode_READY(input) == -1)
8926 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008927 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 kind = PyUnicode_KIND(input);
8929 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008931 if (size == 0)
8932 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008934 /* allocate enough for a simple 1:1 translation without
8935 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008936 _PyUnicodeWriter_Init(&writer);
8937 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939
Victor Stinner872b2912014-04-05 14:27:07 +02008940 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8941
Victor Stinner33798672016-03-01 21:59:58 +01008942 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008943 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008944 if (PyUnicode_IS_ASCII(input)) {
8945 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8946 if (res < 0) {
8947 _PyUnicodeWriter_Dealloc(&writer);
8948 return NULL;
8949 }
8950 if (res == 1)
8951 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008952 }
Victor Stinner33798672016-03-01 21:59:58 +01008953 else {
8954 i = 0;
8955 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008958 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008959 int translate;
8960 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8961 Py_ssize_t newpos;
8962 /* startpos for collecting untranslatable chars */
8963 Py_ssize_t collstart;
8964 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008965 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966
Victor Stinner1194ea02014-04-04 19:37:40 +02008967 ch = PyUnicode_READ(kind, data, i);
8968 translate = charmaptranslate_output(ch, mapping, &writer);
8969 if (translate < 0)
8970 goto onError;
8971
8972 if (translate != 0) {
8973 /* it worked => adjust input pointer */
8974 ++i;
8975 continue;
8976 }
8977
8978 /* untranslatable character */
8979 collstart = i;
8980 collend = i+1;
8981
8982 /* find all untranslatable characters */
8983 while (collend < size) {
8984 PyObject *x;
8985 ch = PyUnicode_READ(kind, data, collend);
8986 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008987 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 Py_XDECREF(x);
8989 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 ++collend;
8992 }
8993
8994 if (ignore) {
8995 i = collend;
8996 }
8997 else {
8998 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8999 reason, input, &exc,
9000 collstart, collend, &newpos);
9001 if (repunicode == NULL)
9002 goto onError;
9003 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009005 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009006 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009007 Py_DECREF(repunicode);
9008 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009009 }
9010 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009011 Py_XDECREF(exc);
9012 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009013 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009016 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009017 Py_XDECREF(exc);
9018 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 return NULL;
9020}
9021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022/* Deprecated. Use PyUnicode_Translate instead. */
9023PyObject *
9024PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9025 Py_ssize_t size,
9026 PyObject *mapping,
9027 const char *errors)
9028{
Christian Heimes5f520f42012-09-11 14:03:25 +02009029 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009030 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 if (!unicode)
9032 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009033 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9034 Py_DECREF(unicode);
9035 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036}
9037
Alexander Belopolsky40018472011-02-26 01:02:56 +00009038PyObject *
9039PyUnicode_Translate(PyObject *str,
9040 PyObject *mapping,
9041 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009043 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009044 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009045 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046}
Tim Petersced69f82003-09-16 20:30:58 +00009047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009049fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050{
9051 /* No need to call PyUnicode_READY(self) because this function is only
9052 called as a callback from fixup() which does it already. */
9053 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9054 const int kind = PyUnicode_KIND(self);
9055 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009056 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009057 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 Py_ssize_t i;
9059
9060 for (i = 0; i < len; ++i) {
9061 ch = PyUnicode_READ(kind, data, i);
9062 fixed = 0;
9063 if (ch > 127) {
9064 if (Py_UNICODE_ISSPACE(ch))
9065 fixed = ' ';
9066 else {
9067 const int decimal = Py_UNICODE_TODECIMAL(ch);
9068 if (decimal >= 0)
9069 fixed = '0' + decimal;
9070 }
9071 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009072 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009073 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 PyUnicode_WRITE(kind, data, i, fixed);
9075 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009076 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009077 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 }
9080
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009081 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082}
9083
9084PyObject *
9085_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9086{
9087 if (!PyUnicode_Check(unicode)) {
9088 PyErr_BadInternalCall();
9089 return NULL;
9090 }
9091 if (PyUnicode_READY(unicode) == -1)
9092 return NULL;
9093 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9094 /* If the string is already ASCII, just return the same string */
9095 Py_INCREF(unicode);
9096 return unicode;
9097 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009098 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099}
9100
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009101PyObject *
9102PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9103 Py_ssize_t length)
9104{
Victor Stinnerf0124502011-11-21 23:12:56 +01009105 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009106 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009107 Py_UCS4 maxchar;
9108 enum PyUnicode_Kind kind;
9109 void *data;
9110
Victor Stinner99d7ad02012-02-22 13:37:39 +01009111 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009112 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009113 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009114 if (ch > 127) {
9115 int decimal = Py_UNICODE_TODECIMAL(ch);
9116 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009117 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009118 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009119 }
9120 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009121
9122 /* Copy to a new string */
9123 decimal = PyUnicode_New(length, maxchar);
9124 if (decimal == NULL)
9125 return decimal;
9126 kind = PyUnicode_KIND(decimal);
9127 data = PyUnicode_DATA(decimal);
9128 /* Iterate over code points */
9129 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009130 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009131 if (ch > 127) {
9132 int decimal = Py_UNICODE_TODECIMAL(ch);
9133 if (decimal >= 0)
9134 ch = '0' + decimal;
9135 }
9136 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009138 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009139}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009140/* --- Decimal Encoder ---------------------------------------------------- */
9141
Alexander Belopolsky40018472011-02-26 01:02:56 +00009142int
9143PyUnicode_EncodeDecimal(Py_UNICODE *s,
9144 Py_ssize_t length,
9145 char *output,
9146 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009147{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009148 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009149 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009150 enum PyUnicode_Kind kind;
9151 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009152
9153 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 PyErr_BadArgument();
9155 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009156 }
9157
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009158 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009159 if (unicode == NULL)
9160 return -1;
9161
Victor Stinner42bf7752011-11-21 22:52:58 +01009162 kind = PyUnicode_KIND(unicode);
9163 data = PyUnicode_DATA(unicode);
9164
Victor Stinnerb84d7232011-11-22 01:50:07 +01009165 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009166 PyObject *exc;
9167 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009169 Py_ssize_t startpos;
9170
9171 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009172
Benjamin Peterson29060642009-01-31 22:14:21 +00009173 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009174 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009175 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009176 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009177 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009178 decimal = Py_UNICODE_TODECIMAL(ch);
9179 if (decimal >= 0) {
9180 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009181 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 continue;
9183 }
9184 if (0 < ch && ch < 256) {
9185 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009186 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009187 continue;
9188 }
Victor Stinner6345be92011-11-25 20:09:01 +01009189
Victor Stinner42bf7752011-11-21 22:52:58 +01009190 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009191 exc = NULL;
9192 raise_encode_exception(&exc, "decimal", unicode,
9193 startpos, startpos+1,
9194 "invalid decimal Unicode string");
9195 Py_XDECREF(exc);
9196 Py_DECREF(unicode);
9197 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009198 }
9199 /* 0-terminate the output string */
9200 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009201 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009202 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009203}
9204
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205/* --- Helpers ------------------------------------------------------------ */
9206
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, maps negative `start`/`end` to offsets from the
   end of the sequence and floors them at 0.  `start` and `end` must be
   modifiable lvalues; `len` may be evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe inside an unbraced if/else), and every argument
   is parenthesised. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009223any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009225 Py_ssize_t end,
9226 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009228 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 void *buf1, *buf2;
9230 Py_ssize_t len1, len2, result;
9231
9232 kind1 = PyUnicode_KIND(s1);
9233 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009234 if (kind1 < kind2)
9235 return -1;
9236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 len1 = PyUnicode_GET_LENGTH(s1);
9238 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009239 ADJUST_INDICES(start, end, len1);
9240 if (end - start < len2)
9241 return -1;
9242
9243 buf1 = PyUnicode_DATA(s1);
9244 buf2 = PyUnicode_DATA(s2);
9245 if (len2 == 1) {
9246 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9247 result = findchar((const char *)buf1 + kind1*start,
9248 kind1, end - start, ch, direction);
9249 if (result == -1)
9250 return -1;
9251 else
9252 return start + result;
9253 }
9254
9255 if (kind2 != kind1) {
9256 buf2 = _PyUnicode_AsKind(s2, kind1);
9257 if (!buf2)
9258 return -2;
9259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260
Victor Stinner794d5672011-10-10 03:21:36 +02009261 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009262 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009263 case PyUnicode_1BYTE_KIND:
9264 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9265 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9266 else
9267 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9268 break;
9269 case PyUnicode_2BYTE_KIND:
9270 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9271 break;
9272 case PyUnicode_4BYTE_KIND:
9273 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9274 break;
9275 default:
9276 assert(0); result = -2;
9277 }
9278 }
9279 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009280 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009281 case PyUnicode_1BYTE_KIND:
9282 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9283 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9284 else
9285 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9286 break;
9287 case PyUnicode_2BYTE_KIND:
9288 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9289 break;
9290 case PyUnicode_4BYTE_KIND:
9291 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9292 break;
9293 default:
9294 assert(0); result = -2;
9295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 }
9297
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009298 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 PyMem_Free(buf2);
9300
9301 return result;
9302}
9303
9304Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009305_PyUnicode_InsertThousandsGrouping(
9306 PyObject *unicode, Py_ssize_t index,
9307 Py_ssize_t n_buffer,
9308 void *digits, Py_ssize_t n_digits,
9309 Py_ssize_t min_width,
9310 const char *grouping, PyObject *thousands_sep,
9311 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312{
Victor Stinner41a863c2012-02-24 00:37:51 +01009313 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009314 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009315 Py_ssize_t thousands_sep_len;
9316 Py_ssize_t len;
9317
9318 if (unicode != NULL) {
9319 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009320 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009321 }
9322 else {
9323 kind = PyUnicode_1BYTE_KIND;
9324 data = NULL;
9325 }
9326 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9327 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9328 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9329 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009330 if (thousands_sep_kind < kind) {
9331 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9332 if (!thousands_sep_data)
9333 return -1;
9334 }
9335 else {
9336 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9337 if (!data)
9338 return -1;
9339 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009340 }
9341
Benjamin Petersonead6b532011-12-20 17:23:42 -06009342 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009344 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009345 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009346 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009347 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009348 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009349 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009350 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009351 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009352 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009353 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009354 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009356 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009357 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009358 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009359 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009360 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009363 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009366 break;
9367 default:
9368 assert(0);
9369 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009371 if (unicode != NULL && thousands_sep_kind != kind) {
9372 if (thousands_sep_kind < kind)
9373 PyMem_Free(thousands_sep_data);
9374 else
9375 PyMem_Free(data);
9376 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 if (unicode == NULL) {
9378 *maxchar = 127;
9379 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009380 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009381 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 }
9383 }
9384 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385}
9386
9387
Alexander Belopolsky40018472011-02-26 01:02:56 +00009388Py_ssize_t
9389PyUnicode_Count(PyObject *str,
9390 PyObject *substr,
9391 Py_ssize_t start,
9392 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009394 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009395 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 void *buf1 = NULL, *buf2 = NULL;
9397 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009398
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009399 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009401
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009402 kind1 = PyUnicode_KIND(str);
9403 kind2 = PyUnicode_KIND(substr);
9404 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009405 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009406
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009407 len1 = PyUnicode_GET_LENGTH(str);
9408 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009410 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009412
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009413 buf1 = PyUnicode_DATA(str);
9414 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009415 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009416 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009417 if (!buf2)
9418 goto onError;
9419 }
9420
9421 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009423 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009424 result = asciilib_count(
9425 ((Py_UCS1*)buf1) + start, end - start,
9426 buf2, len2, PY_SSIZE_T_MAX
9427 );
9428 else
9429 result = ucs1lib_count(
9430 ((Py_UCS1*)buf1) + start, end - start,
9431 buf2, len2, PY_SSIZE_T_MAX
9432 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 break;
9434 case PyUnicode_2BYTE_KIND:
9435 result = ucs2lib_count(
9436 ((Py_UCS2*)buf1) + start, end - start,
9437 buf2, len2, PY_SSIZE_T_MAX
9438 );
9439 break;
9440 case PyUnicode_4BYTE_KIND:
9441 result = ucs4lib_count(
9442 ((Py_UCS4*)buf1) + start, end - start,
9443 buf2, len2, PY_SSIZE_T_MAX
9444 );
9445 break;
9446 default:
9447 assert(0); result = 0;
9448 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009449
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009450 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 PyMem_Free(buf2);
9452
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009455 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 PyMem_Free(buf2);
9457 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458}
9459
Alexander Belopolsky40018472011-02-26 01:02:56 +00009460Py_ssize_t
9461PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009462 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009463 Py_ssize_t start,
9464 Py_ssize_t end,
9465 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009467 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009469
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009470 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471}
9472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473Py_ssize_t
9474PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9475 Py_ssize_t start, Py_ssize_t end,
9476 int direction)
9477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009479 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 if (PyUnicode_READY(str) == -1)
9481 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009482 len = PyUnicode_GET_LENGTH(str);
9483 ADJUST_INDICES(start, end, len);
9484 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009485 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009487 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9488 kind, end-start, ch, direction);
9489 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009491 else
9492 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493}
9494
Alexander Belopolsky40018472011-02-26 01:02:56 +00009495static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009496tailmatch(PyObject *self,
9497 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009498 Py_ssize_t start,
9499 Py_ssize_t end,
9500 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 int kind_self;
9503 int kind_sub;
9504 void *data_self;
9505 void *data_sub;
9506 Py_ssize_t offset;
9507 Py_ssize_t i;
9508 Py_ssize_t end_sub;
9509
9510 if (PyUnicode_READY(self) == -1 ||
9511 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009512 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9515 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009519 if (PyUnicode_GET_LENGTH(substring) == 0)
9520 return 1;
9521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 kind_self = PyUnicode_KIND(self);
9523 data_self = PyUnicode_DATA(self);
9524 kind_sub = PyUnicode_KIND(substring);
9525 data_sub = PyUnicode_DATA(substring);
9526 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9527
9528 if (direction > 0)
9529 offset = end;
9530 else
9531 offset = start;
9532
9533 if (PyUnicode_READ(kind_self, data_self, offset) ==
9534 PyUnicode_READ(kind_sub, data_sub, 0) &&
9535 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9536 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9537 /* If both are of the same kind, memcmp is sufficient */
9538 if (kind_self == kind_sub) {
9539 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009540 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 data_sub,
9542 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009543 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009545 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546 else {
9547 /* We do not need to compare 0 and len(substring)-1 because
9548 the if statement above ensured already that they are equal
9549 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 for (i = 1; i < end_sub; ++i) {
9551 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9552 PyUnicode_READ(kind_sub, data_sub, i))
9553 return 0;
9554 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557 }
9558
9559 return 0;
9560}
9561
Alexander Belopolsky40018472011-02-26 01:02:56 +00009562Py_ssize_t
9563PyUnicode_Tailmatch(PyObject *str,
9564 PyObject *substr,
9565 Py_ssize_t start,
9566 Py_ssize_t end,
9567 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009569 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009571
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009572 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573}
9574
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575/* Apply fixfct filter to the Unicode object self and return a
9576 reference to the modified object */
9577
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009579fixup(PyObject *self,
9580 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 PyObject *u;
9583 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009584 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009586 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009589 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 /* fix functions return the new maximum character in a string,
9592 if the kind of the resulting unicode object does not change,
9593 everything is fine. Otherwise we need to change the string kind
9594 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009595 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009596
9597 if (maxchar_new == 0) {
9598 /* no changes */;
9599 if (PyUnicode_CheckExact(self)) {
9600 Py_DECREF(u);
9601 Py_INCREF(self);
9602 return self;
9603 }
9604 else
9605 return u;
9606 }
9607
Victor Stinnere6abb482012-05-02 01:15:40 +02009608 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609
Victor Stinnereaab6042011-12-11 22:22:39 +01009610 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009612
9613 /* In case the maximum character changed, we need to
9614 convert the string to the new category. */
9615 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9616 if (v == NULL) {
9617 Py_DECREF(u);
9618 return NULL;
9619 }
9620 if (maxchar_new > maxchar_old) {
9621 /* If the maxchar increased so that the kind changed, not all
9622 characters are representable anymore and we need to fix the
9623 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009624 _PyUnicode_FastCopyCharacters(v, 0,
9625 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009626 maxchar_old = fixfct(v);
9627 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 }
9629 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009630 _PyUnicode_FastCopyCharacters(v, 0,
9631 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009633 Py_DECREF(u);
9634 assert(_PyUnicode_CheckConsistency(v, 1));
9635 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636}
9637
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009638static PyObject *
9639ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009641 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9642 char *resdata, *data = PyUnicode_DATA(self);
9643 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009644
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009645 res = PyUnicode_New(len, 127);
9646 if (res == NULL)
9647 return NULL;
9648 resdata = PyUnicode_DATA(res);
9649 if (lower)
9650 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652 _Py_bytes_upper(resdata, data, len);
9653 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654}
9655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659 Py_ssize_t j;
9660 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009661 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009663
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9665
9666 where ! is a negation and \p{xxx} is a character with property xxx.
9667 */
9668 for (j = i - 1; j >= 0; j--) {
9669 c = PyUnicode_READ(kind, data, j);
9670 if (!_PyUnicode_IsCaseIgnorable(c))
9671 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9674 if (final_sigma) {
9675 for (j = i + 1; j < length; j++) {
9676 c = PyUnicode_READ(kind, data, j);
9677 if (!_PyUnicode_IsCaseIgnorable(c))
9678 break;
9679 }
9680 final_sigma = j == length || !_PyUnicode_IsCased(c);
9681 }
9682 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683}
9684
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685static int
9686lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9687 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009689 /* Obscure special case. */
9690 if (c == 0x3A3) {
9691 mapped[0] = handle_capital_sigma(kind, data, length, i);
9692 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695}
9696
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009697static Py_ssize_t
9698do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 Py_ssize_t i, k = 0;
9701 int n_res, j;
9702 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009703
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 c = PyUnicode_READ(kind, data, 0);
9705 n_res = _PyUnicode_ToUpperFull(c, mapped);
9706 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009707 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009708 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 for (i = 1; i < length; i++) {
9711 c = PyUnicode_READ(kind, data, i);
9712 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9713 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009714 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009716 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009717 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719}
9720
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721static Py_ssize_t
9722do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9723 Py_ssize_t i, k = 0;
9724
9725 for (i = 0; i < length; i++) {
9726 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9727 int n_res, j;
9728 if (Py_UNICODE_ISUPPER(c)) {
9729 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9730 }
9731 else if (Py_UNICODE_ISLOWER(c)) {
9732 n_res = _PyUnicode_ToUpperFull(c, mapped);
9733 }
9734 else {
9735 n_res = 1;
9736 mapped[0] = c;
9737 }
9738 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009739 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009740 res[k++] = mapped[j];
9741 }
9742 }
9743 return k;
9744}
9745
9746static Py_ssize_t
9747do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9748 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009750 Py_ssize_t i, k = 0;
9751
9752 for (i = 0; i < length; i++) {
9753 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9754 int n_res, j;
9755 if (lower)
9756 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9757 else
9758 n_res = _PyUnicode_ToUpperFull(c, mapped);
9759 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009760 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009761 res[k++] = mapped[j];
9762 }
9763 }
9764 return k;
9765}
9766
9767static Py_ssize_t
9768do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9769{
9770 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9771}
9772
9773static Py_ssize_t
9774do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9775{
9776 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9777}
9778
Benjamin Petersone51757f2012-01-12 21:10:29 -05009779static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009780do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9781{
9782 Py_ssize_t i, k = 0;
9783
9784 for (i = 0; i < length; i++) {
9785 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9786 Py_UCS4 mapped[3];
9787 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9788 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009789 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009790 res[k++] = mapped[j];
9791 }
9792 }
9793 return k;
9794}
9795
9796static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009797do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9798{
9799 Py_ssize_t i, k = 0;
9800 int previous_is_cased;
9801
9802 previous_is_cased = 0;
9803 for (i = 0; i < length; i++) {
9804 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9805 Py_UCS4 mapped[3];
9806 int n_res, j;
9807
9808 if (previous_is_cased)
9809 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9810 else
9811 n_res = _PyUnicode_ToTitleFull(c, mapped);
9812
9813 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009814 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009815 res[k++] = mapped[j];
9816 }
9817
9818 previous_is_cased = _PyUnicode_IsCased(c);
9819 }
9820 return k;
9821}
9822
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009823static PyObject *
9824case_operation(PyObject *self,
9825 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9826{
9827 PyObject *res = NULL;
9828 Py_ssize_t length, newlength = 0;
9829 int kind, outkind;
9830 void *data, *outdata;
9831 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9832
Benjamin Petersoneea48462012-01-16 14:28:50 -05009833 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009834
9835 kind = PyUnicode_KIND(self);
9836 data = PyUnicode_DATA(self);
9837 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009838 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009839 PyErr_SetString(PyExc_OverflowError, "string is too long");
9840 return NULL;
9841 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009842 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009843 if (tmp == NULL)
9844 return PyErr_NoMemory();
9845 newlength = perform(kind, data, length, tmp, &maxchar);
9846 res = PyUnicode_New(newlength, maxchar);
9847 if (res == NULL)
9848 goto leave;
9849 tmpend = tmp + newlength;
9850 outdata = PyUnicode_DATA(res);
9851 outkind = PyUnicode_KIND(res);
9852 switch (outkind) {
9853 case PyUnicode_1BYTE_KIND:
9854 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9855 break;
9856 case PyUnicode_2BYTE_KIND:
9857 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9858 break;
9859 case PyUnicode_4BYTE_KIND:
9860 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9861 break;
9862 default:
9863 assert(0);
9864 break;
9865 }
9866 leave:
9867 PyMem_FREE(tmp);
9868 return res;
9869}
9870
Tim Peters8ce9f162004-08-27 01:49:32 +00009871PyObject *
9872PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009874 PyObject *res;
9875 PyObject *fseq;
9876 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009877 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009878
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009879 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009880 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009882 }
9883
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009884 /* NOTE: the following code can't call back into Python code,
9885 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009886 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009887
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009888 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009889 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009890 res = _PyUnicode_JoinArray(separator, items, seqlen);
9891 Py_DECREF(fseq);
9892 return res;
9893}
9894
9895PyObject *
9896_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9897{
9898 PyObject *res = NULL; /* the result */
9899 PyObject *sep = NULL;
9900 Py_ssize_t seplen;
9901 PyObject *item;
9902 Py_ssize_t sz, i, res_offset;
9903 Py_UCS4 maxchar;
9904 Py_UCS4 item_maxchar;
9905 int use_memcpy;
9906 unsigned char *res_data = NULL, *sep_data = NULL;
9907 PyObject *last_obj;
9908 unsigned int kind = 0;
9909
Tim Peters05eba1f2004-08-27 21:32:02 +00009910 /* If empty sequence, return u"". */
9911 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009912 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009913 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009914
Tim Peters05eba1f2004-08-27 21:32:02 +00009915 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009916 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009917 if (seqlen == 1) {
9918 if (PyUnicode_CheckExact(items[0])) {
9919 res = items[0];
9920 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009921 return res;
9922 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009923 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009924 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009925 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009926 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009927 /* Set up sep and seplen */
9928 if (separator == NULL) {
9929 /* fall back to a blank space separator */
9930 sep = PyUnicode_FromOrdinal(' ');
9931 if (!sep)
9932 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009933 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009934 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009935 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009936 else {
9937 if (!PyUnicode_Check(separator)) {
9938 PyErr_Format(PyExc_TypeError,
9939 "separator: expected str instance,"
9940 " %.80s found",
9941 Py_TYPE(separator)->tp_name);
9942 goto onError;
9943 }
9944 if (PyUnicode_READY(separator))
9945 goto onError;
9946 sep = separator;
9947 seplen = PyUnicode_GET_LENGTH(separator);
9948 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9949 /* inc refcount to keep this code path symmetric with the
9950 above case of a blank separator */
9951 Py_INCREF(sep);
9952 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009953 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009954 }
9955
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009956 /* There are at least two things to join, or else we have a subclass
9957 * of str in the sequence.
9958 * Do a pre-pass to figure out the total amount of space we'll
9959 * need (sz), and see whether all argument are strings.
9960 */
9961 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009962#ifdef Py_DEBUG
9963 use_memcpy = 0;
9964#else
9965 use_memcpy = 1;
9966#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009967 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009968 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009969 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009970 if (!PyUnicode_Check(item)) {
9971 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009972 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009973 " %.80s found",
9974 i, Py_TYPE(item)->tp_name);
9975 goto onError;
9976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 if (PyUnicode_READY(item) == -1)
9978 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009979 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009981 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009982 if (i != 0) {
9983 add_sz += seplen;
9984 }
9985 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009986 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009987 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009988 goto onError;
9989 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009990 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009991 if (use_memcpy && last_obj != NULL) {
9992 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9993 use_memcpy = 0;
9994 }
9995 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009996 }
Tim Petersced69f82003-09-16 20:30:58 +00009997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009999 if (res == NULL)
10000 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010001
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010002 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010003#ifdef Py_DEBUG
10004 use_memcpy = 0;
10005#else
10006 if (use_memcpy) {
10007 res_data = PyUnicode_1BYTE_DATA(res);
10008 kind = PyUnicode_KIND(res);
10009 if (seplen != 0)
10010 sep_data = PyUnicode_1BYTE_DATA(sep);
10011 }
10012#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010013 if (use_memcpy) {
10014 for (i = 0; i < seqlen; ++i) {
10015 Py_ssize_t itemlen;
10016 item = items[i];
10017
10018 /* Copy item, and maybe the separator. */
10019 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010020 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010021 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010022 kind * seplen);
10023 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010024 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010025
10026 itemlen = PyUnicode_GET_LENGTH(item);
10027 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010028 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010029 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010030 kind * itemlen);
10031 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010032 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010033 }
10034 assert(res_data == PyUnicode_1BYTE_DATA(res)
10035 + kind * PyUnicode_GET_LENGTH(res));
10036 }
10037 else {
10038 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10039 Py_ssize_t itemlen;
10040 item = items[i];
10041
10042 /* Copy item, and maybe the separator. */
10043 if (i && seplen != 0) {
10044 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10045 res_offset += seplen;
10046 }
10047
10048 itemlen = PyUnicode_GET_LENGTH(item);
10049 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010050 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010051 res_offset += itemlen;
10052 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010053 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010054 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010055 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010058 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060
Benjamin Peterson29060642009-01-31 22:14:21 +000010061 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010063 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064 return NULL;
10065}
10066
/* Store `length` copies of the code point `value` into the character
   buffer `data`, starting at character index `start`.

   `kind` selects the element width of `data` (PyUnicode_1BYTE_KIND,
   PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND); PyUnicode_WCHAR_KIND is
   not supported.  `value` is narrowed to the element type on store.

   Every argument is parenthesized, and `value` and `length` are evaluated
   exactly once, so expressions can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_len_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10090
Victor Stinnerd3f08822012-05-29 12:57:52 +020010091void
10092_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10093 Py_UCS4 fill_char)
10094{
10095 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10096 const void *data = PyUnicode_DATA(unicode);
10097 assert(PyUnicode_IS_READY(unicode));
10098 assert(unicode_modifiable(unicode));
10099 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10100 assert(start >= 0);
10101 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10102 FILL(kind, data, fill_char, start, length);
10103}
10104
Victor Stinner3fe55312012-01-04 00:33:50 +010010105Py_ssize_t
10106PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10107 Py_UCS4 fill_char)
10108{
10109 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010110
10111 if (!PyUnicode_Check(unicode)) {
10112 PyErr_BadInternalCall();
10113 return -1;
10114 }
10115 if (PyUnicode_READY(unicode) == -1)
10116 return -1;
10117 if (unicode_check_modifiable(unicode))
10118 return -1;
10119
Victor Stinnerd3f08822012-05-29 12:57:52 +020010120 if (start < 0) {
10121 PyErr_SetString(PyExc_IndexError, "string index out of range");
10122 return -1;
10123 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010124 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10125 PyErr_SetString(PyExc_ValueError,
10126 "fill character is bigger than "
10127 "the string maximum character");
10128 return -1;
10129 }
10130
10131 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10132 length = Py_MIN(maxlen, length);
10133 if (length <= 0)
10134 return 0;
10135
Victor Stinnerd3f08822012-05-29 12:57:52 +020010136 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010137 return length;
10138}
10139
Victor Stinner9310abb2011-10-05 00:59:23 +020010140static PyObject *
10141pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010142 Py_ssize_t left,
10143 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 PyObject *u;
10147 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010148 int kind;
10149 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
10151 if (left < 0)
10152 left = 0;
10153 if (right < 0)
10154 right = 0;
10155
Victor Stinnerc4b49542011-12-11 22:44:26 +010010156 if (left == 0 && right == 0)
10157 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10160 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010161 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10162 return NULL;
10163 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010165 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010167 if (!u)
10168 return NULL;
10169
10170 kind = PyUnicode_KIND(u);
10171 data = PyUnicode_DATA(u);
10172 if (left)
10173 FILL(kind, data, fill, 0, left);
10174 if (right)
10175 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010176 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010177 assert(_PyUnicode_CheckConsistency(u, 1));
10178 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179}
10180
Alexander Belopolsky40018472011-02-26 01:02:56 +000010181PyObject *
10182PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010186 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188
Benjamin Petersonead6b532011-12-20 17:23:42 -060010189 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010191 if (PyUnicode_IS_ASCII(string))
10192 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010193 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010194 PyUnicode_GET_LENGTH(string), keepends);
10195 else
10196 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010197 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010198 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 break;
10200 case PyUnicode_2BYTE_KIND:
10201 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 PyUnicode_GET_LENGTH(string), keepends);
10204 break;
10205 case PyUnicode_4BYTE_KIND:
10206 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010207 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 PyUnicode_GET_LENGTH(string), keepends);
10209 break;
10210 default:
10211 assert(0);
10212 list = 0;
10213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215}
10216
Alexander Belopolsky40018472011-02-26 01:02:56 +000010217static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010218split(PyObject *self,
10219 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010220 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010222 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 void *buf1, *buf2;
10224 Py_ssize_t len1, len2;
10225 PyObject* out;
10226
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010228 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 if (PyUnicode_READY(self) == -1)
10231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010234 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010236 if (PyUnicode_IS_ASCII(self))
10237 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010238 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010239 PyUnicode_GET_LENGTH(self), maxcount
10240 );
10241 else
10242 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010243 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010244 PyUnicode_GET_LENGTH(self), maxcount
10245 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 case PyUnicode_2BYTE_KIND:
10247 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010248 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 PyUnicode_GET_LENGTH(self), maxcount
10250 );
10251 case PyUnicode_4BYTE_KIND:
10252 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010253 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 PyUnicode_GET_LENGTH(self), maxcount
10255 );
10256 default:
10257 assert(0);
10258 return NULL;
10259 }
10260
10261 if (PyUnicode_READY(substring) == -1)
10262 return NULL;
10263
10264 kind1 = PyUnicode_KIND(self);
10265 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 len1 = PyUnicode_GET_LENGTH(self);
10267 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010268 if (kind1 < kind2 || len1 < len2) {
10269 out = PyList_New(1);
10270 if (out == NULL)
10271 return NULL;
10272 Py_INCREF(self);
10273 PyList_SET_ITEM(out, 0, self);
10274 return out;
10275 }
10276 buf1 = PyUnicode_DATA(self);
10277 buf2 = PyUnicode_DATA(substring);
10278 if (kind2 != kind1) {
10279 buf2 = _PyUnicode_AsKind(substring, kind1);
10280 if (!buf2)
10281 return NULL;
10282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010284 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010286 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10287 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010289 else
10290 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010291 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 break;
10293 case PyUnicode_2BYTE_KIND:
10294 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 break;
10297 case PyUnicode_4BYTE_KIND:
10298 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010299 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 break;
10301 default:
10302 out = NULL;
10303 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010304 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 PyMem_Free(buf2);
10306 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307}
10308
Alexander Belopolsky40018472011-02-26 01:02:56 +000010309static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010310rsplit(PyObject *self,
10311 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010312 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010313{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010314 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 void *buf1, *buf2;
10316 Py_ssize_t len1, len2;
10317 PyObject* out;
10318
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010319 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010320 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (PyUnicode_READY(self) == -1)
10323 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010326 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 if (PyUnicode_IS_ASCII(self))
10329 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010330 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010331 PyUnicode_GET_LENGTH(self), maxcount
10332 );
10333 else
10334 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010336 PyUnicode_GET_LENGTH(self), maxcount
10337 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 case PyUnicode_2BYTE_KIND:
10339 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 PyUnicode_GET_LENGTH(self), maxcount
10342 );
10343 case PyUnicode_4BYTE_KIND:
10344 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 PyUnicode_GET_LENGTH(self), maxcount
10347 );
10348 default:
10349 assert(0);
10350 return NULL;
10351 }
10352
10353 if (PyUnicode_READY(substring) == -1)
10354 return NULL;
10355
10356 kind1 = PyUnicode_KIND(self);
10357 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 len1 = PyUnicode_GET_LENGTH(self);
10359 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010360 if (kind1 < kind2 || len1 < len2) {
10361 out = PyList_New(1);
10362 if (out == NULL)
10363 return NULL;
10364 Py_INCREF(self);
10365 PyList_SET_ITEM(out, 0, self);
10366 return out;
10367 }
10368 buf1 = PyUnicode_DATA(self);
10369 buf2 = PyUnicode_DATA(substring);
10370 if (kind2 != kind1) {
10371 buf2 = _PyUnicode_AsKind(substring, kind1);
10372 if (!buf2)
10373 return NULL;
10374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010376 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010378 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10379 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010380 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010381 else
10382 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010383 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 break;
10385 case PyUnicode_2BYTE_KIND:
10386 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010387 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 break;
10389 case PyUnicode_4BYTE_KIND:
10390 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010391 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 break;
10393 default:
10394 out = NULL;
10395 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010396 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 PyMem_Free(buf2);
10398 return out;
10399}
10400
10401static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010402anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10403 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010405 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010407 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10408 return asciilib_find(buf1, len1, buf2, len2, offset);
10409 else
10410 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 case PyUnicode_2BYTE_KIND:
10412 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10413 case PyUnicode_4BYTE_KIND:
10414 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10415 }
10416 assert(0);
10417 return -1;
10418}
10419
10420static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10422 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010424 switch (kind) {
10425 case PyUnicode_1BYTE_KIND:
10426 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10427 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10428 else
10429 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10430 case PyUnicode_2BYTE_KIND:
10431 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10432 case PyUnicode_4BYTE_KIND:
10433 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10434 }
10435 assert(0);
10436 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010437}
10438
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010439static void
10440replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10441 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10442{
10443 int kind = PyUnicode_KIND(u);
10444 void *data = PyUnicode_DATA(u);
10445 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10446 if (kind == PyUnicode_1BYTE_KIND) {
10447 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10448 (Py_UCS1 *)data + len,
10449 u1, u2, maxcount);
10450 }
10451 else if (kind == PyUnicode_2BYTE_KIND) {
10452 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10453 (Py_UCS2 *)data + len,
10454 u1, u2, maxcount);
10455 }
10456 else {
10457 assert(kind == PyUnicode_4BYTE_KIND);
10458 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10459 (Py_UCS4 *)data + len,
10460 u1, u2, maxcount);
10461 }
10462}
10463
Alexander Belopolsky40018472011-02-26 01:02:56 +000010464static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465replace(PyObject *self, PyObject *str1,
10466 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 PyObject *u;
10469 char *sbuf = PyUnicode_DATA(self);
10470 char *buf1 = PyUnicode_DATA(str1);
10471 char *buf2 = PyUnicode_DATA(str2);
10472 int srelease = 0, release1 = 0, release2 = 0;
10473 int skind = PyUnicode_KIND(self);
10474 int kind1 = PyUnicode_KIND(str1);
10475 int kind2 = PyUnicode_KIND(str2);
10476 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10477 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10478 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010479 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010480 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481
10482 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010483 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010485 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486
Victor Stinner59de0ee2011-10-07 10:01:28 +020010487 if (str1 == str2)
10488 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489
Victor Stinner49a0a212011-10-12 23:46:10 +020010490 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010491 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10492 if (maxchar < maxchar_str1)
10493 /* substring too wide to be present */
10494 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010495 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10496 /* Replacing str1 with str2 may cause a maxchar reduction in the
10497 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010498 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010499 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010502 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010504 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010506 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010507 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010509
Victor Stinner69ed0f42013-04-09 21:48:24 +020010510 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010511 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010512 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010513 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010514 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010516 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010518
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010519 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10520 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010521 }
10522 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 int rkind = skind;
10524 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010525 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 if (kind1 < rkind) {
10528 /* widen substring */
10529 buf1 = _PyUnicode_AsKind(str1, rkind);
10530 if (!buf1) goto error;
10531 release1 = 1;
10532 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010533 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010534 if (i < 0)
10535 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 if (rkind > kind2) {
10537 /* widen replacement */
10538 buf2 = _PyUnicode_AsKind(str2, rkind);
10539 if (!buf2) goto error;
10540 release2 = 1;
10541 }
10542 else if (rkind < kind2) {
10543 /* widen self and buf1 */
10544 rkind = kind2;
10545 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010546 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 sbuf = _PyUnicode_AsKind(self, rkind);
10548 if (!sbuf) goto error;
10549 srelease = 1;
10550 buf1 = _PyUnicode_AsKind(str1, rkind);
10551 if (!buf1) goto error;
10552 release1 = 1;
10553 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010554 u = PyUnicode_New(slen, maxchar);
10555 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010557 assert(PyUnicode_KIND(u) == rkind);
10558 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010559
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010560 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010561 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010562 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010564 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010566
10567 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010568 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010569 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010570 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010571 if (i == -1)
10572 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010573 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010575 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010579 }
10580 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010582 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 int rkind = skind;
10584 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010587 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 buf1 = _PyUnicode_AsKind(str1, rkind);
10589 if (!buf1) goto error;
10590 release1 = 1;
10591 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010593 if (n == 0)
10594 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010596 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 buf2 = _PyUnicode_AsKind(str2, rkind);
10598 if (!buf2) goto error;
10599 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010602 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 rkind = kind2;
10604 sbuf = _PyUnicode_AsKind(self, rkind);
10605 if (!sbuf) goto error;
10606 srelease = 1;
10607 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010608 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 buf1 = _PyUnicode_AsKind(str1, rkind);
10610 if (!buf1) goto error;
10611 release1 = 1;
10612 }
10613 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10614 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010615 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 PyErr_SetString(PyExc_OverflowError,
10617 "replace string is too long");
10618 goto error;
10619 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010620 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010621 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010622 _Py_INCREF_UNICODE_EMPTY();
10623 if (!unicode_empty)
10624 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010625 u = unicode_empty;
10626 goto done;
10627 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010628 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 PyErr_SetString(PyExc_OverflowError,
10630 "replace string is too long");
10631 goto error;
10632 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010633 u = PyUnicode_New(new_size, maxchar);
10634 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010636 assert(PyUnicode_KIND(u) == rkind);
10637 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 ires = i = 0;
10639 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010640 while (n-- > 0) {
10641 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010642 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010643 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010644 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010645 if (j == -1)
10646 break;
10647 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010649 memcpy(res + rkind * ires,
10650 sbuf + rkind * i,
10651 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 }
10654 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010656 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010658 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010665 memcpy(res + rkind * ires,
10666 sbuf + rkind * i,
10667 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010668 }
10669 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 /* interleave */
10671 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010672 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010674 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010676 if (--n <= 0)
10677 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010678 memcpy(res + rkind * ires,
10679 sbuf + rkind * i,
10680 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 ires++;
10682 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010684 memcpy(res + rkind * ires,
10685 sbuf + rkind * i,
10686 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010688 }
10689
10690 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010691 unicode_adjust_maxchar(&u);
10692 if (u == NULL)
10693 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010695
10696 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (srelease)
10698 PyMem_FREE(sbuf);
10699 if (release1)
10700 PyMem_FREE(buf1);
10701 if (release2)
10702 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010703 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010705
Benjamin Peterson29060642009-01-31 22:14:21 +000010706 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010707 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 if (srelease)
10709 PyMem_FREE(sbuf);
10710 if (release1)
10711 PyMem_FREE(buf1);
10712 if (release2)
10713 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010714 return unicode_result_unchanged(self);
10715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 error:
10717 if (srelease && sbuf)
10718 PyMem_FREE(sbuf);
10719 if (release1 && buf1)
10720 PyMem_FREE(buf1);
10721 if (release2 && buf2)
10722 PyMem_FREE(buf2);
10723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724}
10725
10726/* --- Unicode Object Methods --------------------------------------------- */
10727
INADA Naoki3ae20562017-01-16 20:41:20 +090010728/*[clinic input]
10729str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730
INADA Naoki3ae20562017-01-16 20:41:20 +090010731Return a version of the string where each word is titlecased.
10732
10733More specifically, words start with uppercased characters and all remaining
10734cased characters have lower case.
10735[clinic start generated code]*/
10736
10737static PyObject *
10738unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010739/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010741 if (PyUnicode_READY(self) == -1)
10742 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010743 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744}
10745
INADA Naoki3ae20562017-01-16 20:41:20 +090010746/*[clinic input]
10747str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748
INADA Naoki3ae20562017-01-16 20:41:20 +090010749Return a capitalized version of the string.
10750
10751More specifically, make the first character have upper case and the rest lower
10752case.
10753[clinic start generated code]*/
10754
10755static PyObject *
10756unicode_capitalize_impl(PyObject *self)
10757/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010759 if (PyUnicode_READY(self) == -1)
10760 return NULL;
10761 if (PyUnicode_GET_LENGTH(self) == 0)
10762 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010763 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764}
10765
INADA Naoki3ae20562017-01-16 20:41:20 +090010766/*[clinic input]
10767str.casefold as unicode_casefold
10768
10769Return a version of the string suitable for caseless comparisons.
10770[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010771
10772static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010773unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010774/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010775{
10776 if (PyUnicode_READY(self) == -1)
10777 return NULL;
10778 if (PyUnicode_IS_ASCII(self))
10779 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010780 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010781}
10782
10783
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010784/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010785
10786static int
10787convert_uc(PyObject *obj, void *addr)
10788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010790
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010791 if (!PyUnicode_Check(obj)) {
10792 PyErr_Format(PyExc_TypeError,
10793 "The fill character must be a unicode character, "
10794 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010795 return 0;
10796 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010797 if (PyUnicode_READY(obj) < 0)
10798 return 0;
10799 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010800 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010801 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010802 return 0;
10803 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010804 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010805 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010806}
10807
INADA Naoki3ae20562017-01-16 20:41:20 +090010808/*[clinic input]
10809str.center as unicode_center
10810
10811 width: Py_ssize_t
10812 fillchar: Py_UCS4 = ' '
10813 /
10814
10815Return a centered string of length width.
10816
10817Padding is done using the specified fill character (default is a space).
10818[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
10820static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010821unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10822/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010824 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825
Benjamin Petersonbac79492012-01-14 13:34:47 -050010826 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827 return NULL;
10828
Victor Stinnerc4b49542011-12-11 22:44:26 +010010829 if (PyUnicode_GET_LENGTH(self) >= width)
10830 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831
Victor Stinnerc4b49542011-12-11 22:44:26 +010010832 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833 left = marg / 2 + (marg & width & 1);
10834
Victor Stinner9310abb2011-10-05 00:59:23 +020010835 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836}
10837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838/* This function assumes that str1 and str2 are readied by the caller. */
10839
Marc-André Lemburge5034372000-08-08 08:04:29 +000010840static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010841unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010842{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010843#define COMPARE(TYPE1, TYPE2) \
10844 do { \
10845 TYPE1* p1 = (TYPE1 *)data1; \
10846 TYPE2* p2 = (TYPE2 *)data2; \
10847 TYPE1* end = p1 + len; \
10848 Py_UCS4 c1, c2; \
10849 for (; p1 != end; p1++, p2++) { \
10850 c1 = *p1; \
10851 c2 = *p2; \
10852 if (c1 != c2) \
10853 return (c1 < c2) ? -1 : 1; \
10854 } \
10855 } \
10856 while (0)
10857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 int kind1, kind2;
10859 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010860 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 kind1 = PyUnicode_KIND(str1);
10863 kind2 = PyUnicode_KIND(str2);
10864 data1 = PyUnicode_DATA(str1);
10865 data2 = PyUnicode_DATA(str2);
10866 len1 = PyUnicode_GET_LENGTH(str1);
10867 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010868 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010869
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010870 switch(kind1) {
10871 case PyUnicode_1BYTE_KIND:
10872 {
10873 switch(kind2) {
10874 case PyUnicode_1BYTE_KIND:
10875 {
10876 int cmp = memcmp(data1, data2, len);
10877 /* normalize result of memcmp() into the range [-1; 1] */
10878 if (cmp < 0)
10879 return -1;
10880 if (cmp > 0)
10881 return 1;
10882 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010883 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010884 case PyUnicode_2BYTE_KIND:
10885 COMPARE(Py_UCS1, Py_UCS2);
10886 break;
10887 case PyUnicode_4BYTE_KIND:
10888 COMPARE(Py_UCS1, Py_UCS4);
10889 break;
10890 default:
10891 assert(0);
10892 }
10893 break;
10894 }
10895 case PyUnicode_2BYTE_KIND:
10896 {
10897 switch(kind2) {
10898 case PyUnicode_1BYTE_KIND:
10899 COMPARE(Py_UCS2, Py_UCS1);
10900 break;
10901 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010902 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010903 COMPARE(Py_UCS2, Py_UCS2);
10904 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010905 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010906 case PyUnicode_4BYTE_KIND:
10907 COMPARE(Py_UCS2, Py_UCS4);
10908 break;
10909 default:
10910 assert(0);
10911 }
10912 break;
10913 }
10914 case PyUnicode_4BYTE_KIND:
10915 {
10916 switch(kind2) {
10917 case PyUnicode_1BYTE_KIND:
10918 COMPARE(Py_UCS4, Py_UCS1);
10919 break;
10920 case PyUnicode_2BYTE_KIND:
10921 COMPARE(Py_UCS4, Py_UCS2);
10922 break;
10923 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010924 {
10925#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10926 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10927 /* normalize result of wmemcmp() into the range [-1; 1] */
10928 if (cmp < 0)
10929 return -1;
10930 if (cmp > 0)
10931 return 1;
10932#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010933 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010934#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010935 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010936 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010937 default:
10938 assert(0);
10939 }
10940 break;
10941 }
10942 default:
10943 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010944 }
10945
Victor Stinner770e19e2012-10-04 22:59:45 +020010946 if (len1 == len2)
10947 return 0;
10948 if (len1 < len2)
10949 return -1;
10950 else
10951 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010952
10953#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010954}
10955
Benjamin Peterson621b4302016-09-09 13:54:34 -070010956static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010957unicode_compare_eq(PyObject *str1, PyObject *str2)
10958{
10959 int kind;
10960 void *data1, *data2;
10961 Py_ssize_t len;
10962 int cmp;
10963
Victor Stinnere5567ad2012-10-23 02:48:49 +020010964 len = PyUnicode_GET_LENGTH(str1);
10965 if (PyUnicode_GET_LENGTH(str2) != len)
10966 return 0;
10967 kind = PyUnicode_KIND(str1);
10968 if (PyUnicode_KIND(str2) != kind)
10969 return 0;
10970 data1 = PyUnicode_DATA(str1);
10971 data2 = PyUnicode_DATA(str2);
10972
10973 cmp = memcmp(data1, data2, len * kind);
10974 return (cmp == 0);
10975}
10976
10977
Alexander Belopolsky40018472011-02-26 01:02:56 +000010978int
10979PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10982 if (PyUnicode_READY(left) == -1 ||
10983 PyUnicode_READY(right) == -1)
10984 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010985
10986 /* a string is equal to itself */
10987 if (left == right)
10988 return 0;
10989
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010990 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010992 PyErr_Format(PyExc_TypeError,
10993 "Can't compare %.100s and %.100s",
10994 left->ob_type->tp_name,
10995 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996 return -1;
10997}
10998
Martin v. Löwis5b222132007-06-10 09:51:05 +000010999int
11000PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11001{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 Py_ssize_t i;
11003 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011005 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006
Victor Stinner910337b2011-10-03 03:20:16 +020011007 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011008 if (!PyUnicode_IS_READY(uni)) {
11009 const wchar_t *ws = _PyUnicode_WSTR(uni);
11010 /* Compare Unicode string and source character set string */
11011 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11012 if (chr != ustr[i])
11013 return (chr < ustr[i]) ? -1 : 1;
11014 }
11015 /* This check keeps Python strings that end in '\0' from comparing equal
11016 to C strings identical up to that point. */
11017 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11018 return 1; /* uni is longer */
11019 if (ustr[i])
11020 return -1; /* str is longer */
11021 return 0;
11022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011024 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011025 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011026 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011027 size_t len, len2 = strlen(str);
11028 int cmp;
11029
11030 len = Py_MIN(len1, len2);
11031 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011032 if (cmp != 0) {
11033 if (cmp < 0)
11034 return -1;
11035 else
11036 return 1;
11037 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011038 if (len1 > len2)
11039 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011040 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011041 return -1; /* str is longer */
11042 return 0;
11043 }
11044 else {
11045 void *data = PyUnicode_DATA(uni);
11046 /* Compare Unicode string and source character set string */
11047 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011048 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011049 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11050 /* This check keeps Python strings that end in '\0' from comparing equal
11051 to C strings identical up to that point. */
11052 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11053 return 1; /* uni is longer */
11054 if (str[i])
11055 return -1; /* str is longer */
11056 return 0;
11057 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011058}
11059
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011060static int
11061non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11062{
11063 size_t i, len;
11064 const wchar_t *p;
11065 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11066 if (strlen(str) != len)
11067 return 0;
11068 p = _PyUnicode_WSTR(unicode);
11069 assert(p);
11070 for (i = 0; i < len; i++) {
11071 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011072 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011073 return 0;
11074 }
11075 return 1;
11076}
11077
11078int
11079_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11080{
11081 size_t len;
11082 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011083 assert(str);
11084#ifndef NDEBUG
11085 for (const char *p = str; *p; p++) {
11086 assert((unsigned char)*p < 128);
11087 }
11088#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011089 if (PyUnicode_READY(unicode) == -1) {
11090 /* Memory error or bad data */
11091 PyErr_Clear();
11092 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11093 }
11094 if (!PyUnicode_IS_ASCII(unicode))
11095 return 0;
11096 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11097 return strlen(str) == len &&
11098 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11099}
11100
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011101int
11102_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11103{
11104 PyObject *right_uni;
11105 Py_hash_t hash;
11106
11107 assert(_PyUnicode_CHECK(left));
11108 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011109#ifndef NDEBUG
11110 for (const char *p = right->string; *p; p++) {
11111 assert((unsigned char)*p < 128);
11112 }
11113#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011114
11115 if (PyUnicode_READY(left) == -1) {
11116 /* memory error or bad data */
11117 PyErr_Clear();
11118 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11119 }
11120
11121 if (!PyUnicode_IS_ASCII(left))
11122 return 0;
11123
11124 right_uni = _PyUnicode_FromId(right); /* borrowed */
11125 if (right_uni == NULL) {
11126 /* memory error or bad data */
11127 PyErr_Clear();
11128 return _PyUnicode_EqualToASCIIString(left, right->string);
11129 }
11130
11131 if (left == right_uni)
11132 return 1;
11133
11134 if (PyUnicode_CHECK_INTERNED(left))
11135 return 0;
11136
11137 assert(_PyUnicode_HASH(right_uni) != 1);
11138 hash = _PyUnicode_HASH(left);
11139 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11140 return 0;
11141
11142 return unicode_compare_eq(left, right_uni);
11143}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011144
Benjamin Peterson29060642009-01-31 22:14:21 +000011145#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011146 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011147
Alexander Belopolsky40018472011-02-26 01:02:56 +000011148PyObject *
11149PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011150{
11151 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011152 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011153
Victor Stinnere5567ad2012-10-23 02:48:49 +020011154 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11155 Py_RETURN_NOTIMPLEMENTED;
11156
11157 if (PyUnicode_READY(left) == -1 ||
11158 PyUnicode_READY(right) == -1)
11159 return NULL;
11160
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011161 if (left == right) {
11162 switch (op) {
11163 case Py_EQ:
11164 case Py_LE:
11165 case Py_GE:
11166 /* a string is equal to itself */
11167 v = Py_True;
11168 break;
11169 case Py_NE:
11170 case Py_LT:
11171 case Py_GT:
11172 v = Py_False;
11173 break;
11174 default:
11175 PyErr_BadArgument();
11176 return NULL;
11177 }
11178 }
11179 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011180 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011181 result ^= (op == Py_NE);
11182 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011183 }
11184 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011185 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011186
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011187 /* Convert the return value to a Boolean */
11188 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011189 case Py_LE:
11190 v = TEST_COND(result <= 0);
11191 break;
11192 case Py_GE:
11193 v = TEST_COND(result >= 0);
11194 break;
11195 case Py_LT:
11196 v = TEST_COND(result == -1);
11197 break;
11198 case Py_GT:
11199 v = TEST_COND(result == 1);
11200 break;
11201 default:
11202 PyErr_BadArgument();
11203 return NULL;
11204 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011205 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011206 Py_INCREF(v);
11207 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011208}
11209
Alexander Belopolsky40018472011-02-26 01:02:56 +000011210int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011211_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11212{
11213 return unicode_eq(aa, bb);
11214}
11215
11216int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011217PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011218{
Victor Stinner77282cb2013-04-14 19:22:47 +020011219 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 void *buf1, *buf2;
11221 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011222 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011223
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011224 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011225 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011226 "'in <string>' requires string as left operand, not %.100s",
11227 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011228 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011229 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011230 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011231 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011232 if (ensure_unicode(str) < 0)
11233 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 kind2 = PyUnicode_KIND(substr);
11237 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011238 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 len2 = PyUnicode_GET_LENGTH(substr);
11241 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011242 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011243 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011244 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011245 if (len2 == 1) {
11246 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11247 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011248 return result;
11249 }
11250 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011251 buf2 = _PyUnicode_AsKind(substr, kind1);
11252 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011253 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255
Victor Stinner77282cb2013-04-14 19:22:47 +020011256 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 case PyUnicode_1BYTE_KIND:
11258 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11259 break;
11260 case PyUnicode_2BYTE_KIND:
11261 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11262 break;
11263 case PyUnicode_4BYTE_KIND:
11264 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11265 break;
11266 default:
11267 result = -1;
11268 assert(0);
11269 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011270
Victor Stinner77282cb2013-04-14 19:22:47 +020011271 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 PyMem_Free(buf2);
11273
Guido van Rossum403d68b2000-03-13 15:55:09 +000011274 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011275}
11276
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277/* Concat to string or Unicode object giving a new Unicode object. */
11278
Alexander Belopolsky40018472011-02-26 01:02:56 +000011279PyObject *
11280PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011282 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011283 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011284 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011286 if (ensure_unicode(left) < 0)
11287 return NULL;
11288
11289 if (!PyUnicode_Check(right)) {
11290 PyErr_Format(PyExc_TypeError,
11291 "can only concatenate str (not \"%.200s\") to str",
11292 right->ob_type->tp_name);
11293 return NULL;
11294 }
11295 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297
11298 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 if (left == unicode_empty)
11300 return PyUnicode_FromObject(right);
11301 if (right == unicode_empty)
11302 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011304 left_len = PyUnicode_GET_LENGTH(left);
11305 right_len = PyUnicode_GET_LENGTH(right);
11306 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011307 PyErr_SetString(PyExc_OverflowError,
11308 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011309 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011310 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011311 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011312
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011313 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11314 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011315 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011318 result = PyUnicode_New(new_len, maxchar);
11319 if (result == NULL)
11320 return NULL;
11321 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11322 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11323 assert(_PyUnicode_CheckConsistency(result, 1));
11324 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325}
11326
Walter Dörwald1ab83302007-05-18 17:15:44 +000011327void
Victor Stinner23e56682011-10-03 03:54:37 +020011328PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011329{
Victor Stinner23e56682011-10-03 03:54:37 +020011330 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011331 Py_UCS4 maxchar, maxchar2;
11332 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011333
11334 if (p_left == NULL) {
11335 if (!PyErr_Occurred())
11336 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011337 return;
11338 }
Victor Stinner23e56682011-10-03 03:54:37 +020011339 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011340 if (right == NULL || left == NULL
11341 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011342 if (!PyErr_Occurred())
11343 PyErr_BadInternalCall();
11344 goto error;
11345 }
11346
Benjamin Petersonbac79492012-01-14 13:34:47 -050011347 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011348 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011349 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011350 goto error;
11351
Victor Stinner488fa492011-12-12 00:01:39 +010011352 /* Shortcuts */
11353 if (left == unicode_empty) {
11354 Py_DECREF(left);
11355 Py_INCREF(right);
11356 *p_left = right;
11357 return;
11358 }
11359 if (right == unicode_empty)
11360 return;
11361
11362 left_len = PyUnicode_GET_LENGTH(left);
11363 right_len = PyUnicode_GET_LENGTH(right);
11364 if (left_len > PY_SSIZE_T_MAX - right_len) {
11365 PyErr_SetString(PyExc_OverflowError,
11366 "strings are too large to concat");
11367 goto error;
11368 }
11369 new_len = left_len + right_len;
11370
11371 if (unicode_modifiable(left)
11372 && PyUnicode_CheckExact(right)
11373 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011374 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11375 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011376 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011377 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011378 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11379 {
11380 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011381 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011382 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011383
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011384 /* copy 'right' into the newly allocated area of 'left' */
11385 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011386 }
Victor Stinner488fa492011-12-12 00:01:39 +010011387 else {
11388 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11389 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011390 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011391
Victor Stinner488fa492011-12-12 00:01:39 +010011392 /* Concat the two Unicode strings */
11393 res = PyUnicode_New(new_len, maxchar);
11394 if (res == NULL)
11395 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011396 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11397 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011398 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011399 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011400 }
11401 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011402 return;
11403
11404error:
Victor Stinner488fa492011-12-12 00:01:39 +010011405 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011406}
11407
11408void
11409PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11410{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011411 PyUnicode_Append(pleft, right);
11412 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011413}
11414
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011415/*
11416Wraps stringlib_parse_args_finds() and additionally ensures that the
11417first argument is a unicode object.
11418*/
11419
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011420static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011421parse_args_finds_unicode(const char * function_name, PyObject *args,
11422 PyObject **substring,
11423 Py_ssize_t *start, Py_ssize_t *end)
11424{
11425 if(stringlib_parse_args_finds(function_name, args, substring,
11426 start, end)) {
11427 if (ensure_unicode(*substring) < 0)
11428 return 0;
11429 return 1;
11430 }
11431 return 0;
11432}
11433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011434PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011435 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011437Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011438string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011439interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
11441static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011442unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011444 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011445 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011446 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011448 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 void *buf1, *buf2;
11450 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011452 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 kind1 = PyUnicode_KIND(self);
11456 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011457 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011458 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 len1 = PyUnicode_GET_LENGTH(self);
11461 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011463 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011464 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011465
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011466 buf1 = PyUnicode_DATA(self);
11467 buf2 = PyUnicode_DATA(substring);
11468 if (kind2 != kind1) {
11469 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011470 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011471 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011472 }
11473 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 case PyUnicode_1BYTE_KIND:
11475 iresult = ucs1lib_count(
11476 ((Py_UCS1*)buf1) + start, end - start,
11477 buf2, len2, PY_SSIZE_T_MAX
11478 );
11479 break;
11480 case PyUnicode_2BYTE_KIND:
11481 iresult = ucs2lib_count(
11482 ((Py_UCS2*)buf1) + start, end - start,
11483 buf2, len2, PY_SSIZE_T_MAX
11484 );
11485 break;
11486 case PyUnicode_4BYTE_KIND:
11487 iresult = ucs4lib_count(
11488 ((Py_UCS4*)buf1) + start, end - start,
11489 buf2, len2, PY_SSIZE_T_MAX
11490 );
11491 break;
11492 default:
11493 assert(0); iresult = 0;
11494 }
11495
11496 result = PyLong_FromSsize_t(iresult);
11497
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011498 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 return result;
11502}
11503
INADA Naoki3ae20562017-01-16 20:41:20 +090011504/*[clinic input]
11505str.encode as unicode_encode
11506
11507 encoding: str(c_default="NULL") = 'utf-8'
11508 The encoding in which to encode the string.
11509 errors: str(c_default="NULL") = 'strict'
11510 The error handling scheme to use for encoding errors.
11511 The default is 'strict' meaning that encoding errors raise a
11512 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11513 'xmlcharrefreplace' as well as any other name registered with
11514 codecs.register_error that can handle UnicodeEncodeErrors.
11515
11516Encode the string using the codec registered for encoding.
11517[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
11519static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011520unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011521/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011523 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011524}
11525
INADA Naoki3ae20562017-01-16 20:41:20 +090011526/*[clinic input]
11527str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
INADA Naoki3ae20562017-01-16 20:41:20 +090011529 tabsize: int = 8
11530
11531Return a copy where all tab characters are expanded using spaces.
11532
11533If tabsize is not given, a tab size of 8 characters is assumed.
11534[clinic start generated code]*/
11535
11536static PyObject *
11537unicode_expandtabs_impl(PyObject *self, int tabsize)
11538/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011540 Py_ssize_t i, j, line_pos, src_len, incr;
11541 Py_UCS4 ch;
11542 PyObject *u;
11543 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011544 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011545 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
Antoine Pitrou22425222011-10-04 19:10:51 +020011547 if (PyUnicode_READY(self) == -1)
11548 return NULL;
11549
Thomas Wouters7e474022000-07-16 12:04:32 +000011550 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011551 src_len = PyUnicode_GET_LENGTH(self);
11552 i = j = line_pos = 0;
11553 kind = PyUnicode_KIND(self);
11554 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011555 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011556 for (; i < src_len; i++) {
11557 ch = PyUnicode_READ(kind, src_data, i);
11558 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011559 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011563 goto overflow;
11564 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011566 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011570 goto overflow;
11571 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011573 if (ch == '\n' || ch == '\r')
11574 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011577 if (!found)
11578 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011579
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011581 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 if (!u)
11583 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011584 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
Antoine Pitroue71d5742011-10-04 15:55:09 +020011586 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
Antoine Pitroue71d5742011-10-04 15:55:09 +020011588 for (; i < src_len; i++) {
11589 ch = PyUnicode_READ(kind, src_data, i);
11590 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011591 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011592 incr = tabsize - (line_pos % tabsize);
11593 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011594 FILL(kind, dest_data, ' ', j, incr);
11595 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011597 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011599 line_pos++;
11600 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011601 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011602 if (ch == '\n' || ch == '\r')
11603 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011605 }
11606 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011607 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011608
Antoine Pitroue71d5742011-10-04 15:55:09 +020011609 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011610 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11611 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612}
11613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011614PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616\n\
11617Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011618such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619arguments start and end are interpreted as in slice notation.\n\
11620\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011621Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
11623static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011626 /* initialize variables to prevent gcc warning */
11627 PyObject *substring = NULL;
11628 Py_ssize_t start = 0;
11629 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011630 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011632 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011635 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011638 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 if (result == -2)
11641 return NULL;
11642
Christian Heimes217cfd12007-12-02 14:31:20 +000011643 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644}
11645
11646static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011647unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011649 void *data;
11650 enum PyUnicode_Kind kind;
11651 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011652
11653 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11654 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011656 }
11657 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11658 PyErr_SetString(PyExc_IndexError, "string index out of range");
11659 return NULL;
11660 }
11661 kind = PyUnicode_KIND(self);
11662 data = PyUnicode_DATA(self);
11663 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011664 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665}
11666
Guido van Rossumc2504932007-09-18 19:42:40 +000011667/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011668 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011669static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011670unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671{
Guido van Rossumc2504932007-09-18 19:42:40 +000011672 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011673 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011674
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011675#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011676 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011677#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 if (_PyUnicode_HASH(self) != -1)
11679 return _PyUnicode_HASH(self);
11680 if (PyUnicode_READY(self) == -1)
11681 return -1;
11682 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011683 /*
11684 We make the hash of the empty string be 0, rather than using
11685 (prefix ^ suffix), since this slightly obfuscates the hash secret
11686 */
11687 if (len == 0) {
11688 _PyUnicode_HASH(self) = 0;
11689 return 0;
11690 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011691 x = _Py_HashBytes(PyUnicode_DATA(self),
11692 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011694 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695}
11696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011697PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011698 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011700Return the lowest index in S where substring sub is found, \n\
11701such that sub is contained within S[start:end]. Optional\n\
11702arguments start and end are interpreted as in slice notation.\n\
11703\n\
11704Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705
11706static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011709 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011710 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011711 PyObject *substring = NULL;
11712 Py_ssize_t start = 0;
11713 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011715 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011718 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011721 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 if (result == -2)
11724 return NULL;
11725
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 if (result < 0) {
11727 PyErr_SetString(PyExc_ValueError, "substring not found");
11728 return NULL;
11729 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011730
Christian Heimes217cfd12007-12-02 14:31:20 +000011731 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732}
11733
INADA Naoki3ae20562017-01-16 20:41:20 +090011734/*[clinic input]
11735str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736
INADA Naoki3ae20562017-01-16 20:41:20 +090011737Return True if the string is a lowercase string, False otherwise.
11738
11739A string is lowercase if all cased characters in the string are lowercase and
11740there is at least one cased character in the string.
11741[clinic start generated code]*/
11742
11743static PyObject *
11744unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011745/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 Py_ssize_t i, length;
11748 int kind;
11749 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 int cased;
11751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (PyUnicode_READY(self) == -1)
11753 return NULL;
11754 length = PyUnicode_GET_LENGTH(self);
11755 kind = PyUnicode_KIND(self);
11756 data = PyUnicode_DATA(self);
11757
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 if (length == 1)
11760 return PyBool_FromLong(
11761 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011763 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011765 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011766
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 for (i = 0; i < length; i++) {
11769 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011770
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011772 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 else if (!cased && Py_UNICODE_ISLOWER(ch))
11774 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011776 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777}
11778
INADA Naoki3ae20562017-01-16 20:41:20 +090011779/*[clinic input]
11780str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781
INADA Naoki3ae20562017-01-16 20:41:20 +090011782Return True if the string is an uppercase string, False otherwise.
11783
11784A string is uppercase if all cased characters in the string are uppercase and
11785there is at least one cased character in the string.
11786[clinic start generated code]*/
11787
11788static PyObject *
11789unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011790/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 Py_ssize_t i, length;
11793 int kind;
11794 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795 int cased;
11796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 if (PyUnicode_READY(self) == -1)
11798 return NULL;
11799 length = PyUnicode_GET_LENGTH(self);
11800 kind = PyUnicode_KIND(self);
11801 data = PyUnicode_DATA(self);
11802
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 if (length == 1)
11805 return PyBool_FromLong(
11806 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011808 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011810 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011811
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 for (i = 0; i < length; i++) {
11814 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011815
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011817 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 else if (!cased && Py_UNICODE_ISUPPER(ch))
11819 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011821 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822}
11823
INADA Naoki3ae20562017-01-16 20:41:20 +090011824/*[clinic input]
11825str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826
INADA Naoki3ae20562017-01-16 20:41:20 +090011827Return True if the string is a title-cased string, False otherwise.
11828
11829In a title-cased string, upper- and title-case characters may only
11830follow uncased characters and lowercase characters only cased ones.
11831[clinic start generated code]*/
11832
11833static PyObject *
11834unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011835/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 Py_ssize_t i, length;
11838 int kind;
11839 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840 int cased, previous_is_cased;
11841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 if (PyUnicode_READY(self) == -1)
11843 return NULL;
11844 length = PyUnicode_GET_LENGTH(self);
11845 kind = PyUnicode_KIND(self);
11846 data = PyUnicode_DATA(self);
11847
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 if (length == 1) {
11850 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11851 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11852 (Py_UNICODE_ISUPPER(ch) != 0));
11853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011855 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011857 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011858
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 cased = 0;
11860 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 for (i = 0; i < length; i++) {
11862 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011863
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11865 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011866 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 previous_is_cased = 1;
11868 cased = 1;
11869 }
11870 else if (Py_UNICODE_ISLOWER(ch)) {
11871 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011872 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 previous_is_cased = 1;
11874 cased = 1;
11875 }
11876 else
11877 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011879 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880}
11881
INADA Naoki3ae20562017-01-16 20:41:20 +090011882/*[clinic input]
11883str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
INADA Naoki3ae20562017-01-16 20:41:20 +090011885Return True if the string is a whitespace string, False otherwise.
11886
11887A string is whitespace if all characters in the string are whitespace and there
11888is at least one character in the string.
11889[clinic start generated code]*/
11890
11891static PyObject *
11892unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011893/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 Py_ssize_t i, length;
11896 int kind;
11897 void *data;
11898
11899 if (PyUnicode_READY(self) == -1)
11900 return NULL;
11901 length = PyUnicode_GET_LENGTH(self);
11902 kind = PyUnicode_KIND(self);
11903 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 if (length == 1)
11907 return PyBool_FromLong(
11908 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011910 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011912 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 for (i = 0; i < length; i++) {
11915 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011916 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011917 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011919 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920}
11921
INADA Naoki3ae20562017-01-16 20:41:20 +090011922/*[clinic input]
11923str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011924
INADA Naoki3ae20562017-01-16 20:41:20 +090011925Return True if the string is an alphabetic string, False otherwise.
11926
11927A string is alphabetic if all characters in the string are alphabetic and there
11928is at least one character in the string.
11929[clinic start generated code]*/
11930
11931static PyObject *
11932unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011933/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 Py_ssize_t i, length;
11936 int kind;
11937 void *data;
11938
11939 if (PyUnicode_READY(self) == -1)
11940 return NULL;
11941 length = PyUnicode_GET_LENGTH(self);
11942 kind = PyUnicode_KIND(self);
11943 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011944
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011945 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 if (length == 1)
11947 return PyBool_FromLong(
11948 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011949
11950 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011952 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 for (i = 0; i < length; i++) {
11955 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011956 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011957 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011958 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011959}
11960
INADA Naoki3ae20562017-01-16 20:41:20 +090011961/*[clinic input]
11962str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011963
INADA Naoki3ae20562017-01-16 20:41:20 +090011964Return True if the string is an alpha-numeric string, False otherwise.
11965
11966A string is alpha-numeric if all characters in the string are alpha-numeric and
11967there is at least one character in the string.
11968[clinic start generated code]*/
11969
11970static PyObject *
11971unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011972/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011973{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 int kind;
11975 void *data;
11976 Py_ssize_t len, i;
11977
11978 if (PyUnicode_READY(self) == -1)
11979 return NULL;
11980
11981 kind = PyUnicode_KIND(self);
11982 data = PyUnicode_DATA(self);
11983 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011984
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011985 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 if (len == 1) {
11987 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11988 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11989 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011990
11991 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011993 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 for (i = 0; i < len; i++) {
11996 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011997 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011998 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011999 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012000 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012001}
12002
INADA Naoki3ae20562017-01-16 20:41:20 +090012003/*[clinic input]
12004str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005
INADA Naoki3ae20562017-01-16 20:41:20 +090012006Return True if the string is a decimal string, False otherwise.
12007
12008A string is a decimal string if all characters in the string are decimal and
12009there is at least one character in the string.
12010[clinic start generated code]*/
12011
12012static PyObject *
12013unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012014/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 Py_ssize_t i, length;
12017 int kind;
12018 void *data;
12019
12020 if (PyUnicode_READY(self) == -1)
12021 return NULL;
12022 length = PyUnicode_GET_LENGTH(self);
12023 kind = PyUnicode_KIND(self);
12024 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 if (length == 1)
12028 return PyBool_FromLong(
12029 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012031 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012033 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 for (i = 0; i < length; i++) {
12036 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012037 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012039 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040}
12041
INADA Naoki3ae20562017-01-16 20:41:20 +090012042/*[clinic input]
12043str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044
INADA Naoki3ae20562017-01-16 20:41:20 +090012045Return True if the string is a digit string, False otherwise.
12046
12047A string is a digit string if all characters in the string are digits and there
12048is at least one character in the string.
12049[clinic start generated code]*/
12050
12051static PyObject *
12052unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012053/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 Py_ssize_t i, length;
12056 int kind;
12057 void *data;
12058
12059 if (PyUnicode_READY(self) == -1)
12060 return NULL;
12061 length = PyUnicode_GET_LENGTH(self);
12062 kind = PyUnicode_KIND(self);
12063 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 if (length == 1) {
12067 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12068 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012071 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012073 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 for (i = 0; i < length; i++) {
12076 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012077 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012079 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080}
12081
INADA Naoki3ae20562017-01-16 20:41:20 +090012082/*[clinic input]
12083str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
INADA Naoki3ae20562017-01-16 20:41:20 +090012085Return True if the string is a numeric string, False otherwise.
12086
12087A string is numeric if all characters in the string are numeric and there is at
12088least one character in the string.
12089[clinic start generated code]*/
12090
12091static PyObject *
12092unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012093/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 Py_ssize_t i, length;
12096 int kind;
12097 void *data;
12098
12099 if (PyUnicode_READY(self) == -1)
12100 return NULL;
12101 length = PyUnicode_GET_LENGTH(self);
12102 kind = PyUnicode_KIND(self);
12103 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 if (length == 1)
12107 return PyBool_FromLong(
12108 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012110 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012112 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 for (i = 0; i < length; i++) {
12115 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012116 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012118 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119}
12120
Martin v. Löwis47383402007-08-15 07:32:56 +000012121int
12122PyUnicode_IsIdentifier(PyObject *self)
12123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 int kind;
12125 void *data;
12126 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012127 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 if (PyUnicode_READY(self) == -1) {
12130 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 }
12133
12134 /* Special case for empty strings */
12135 if (PyUnicode_GET_LENGTH(self) == 0)
12136 return 0;
12137 kind = PyUnicode_KIND(self);
12138 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012139
12140 /* PEP 3131 says that the first character must be in
12141 XID_Start and subsequent characters in XID_Continue,
12142 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012143 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012144 letters, digits, underscore). However, given the current
12145 definition of XID_Start and XID_Continue, it is sufficient
12146 to check just for these, except that _ must be allowed
12147 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012149 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012150 return 0;
12151
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012152 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012155 return 1;
12156}
12157
INADA Naoki3ae20562017-01-16 20:41:20 +090012158/*[clinic input]
12159str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012160
INADA Naoki3ae20562017-01-16 20:41:20 +090012161Return True if the string is a valid Python identifier, False otherwise.
12162
12163Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12164"class".
12165[clinic start generated code]*/
12166
12167static PyObject *
12168unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012169/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012170{
12171 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12172}
12173
INADA Naoki3ae20562017-01-16 20:41:20 +090012174/*[clinic input]
12175str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012176
INADA Naoki3ae20562017-01-16 20:41:20 +090012177Return True if the string is printable, False otherwise.
12178
12179A string is printable if all of its characters are considered printable in
12180repr() or if it is empty.
12181[clinic start generated code]*/
12182
12183static PyObject *
12184unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012185/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012186{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012187 Py_ssize_t i, length;
12188 int kind;
12189 void *data;
12190
12191 if (PyUnicode_READY(self) == -1)
12192 return NULL;
12193 length = PyUnicode_GET_LENGTH(self);
12194 kind = PyUnicode_KIND(self);
12195 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012196
12197 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 if (length == 1)
12199 return PyBool_FromLong(
12200 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 for (i = 0; i < length; i++) {
12203 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012204 Py_RETURN_FALSE;
12205 }
12206 }
12207 Py_RETURN_TRUE;
12208}
12209
INADA Naoki3ae20562017-01-16 20:41:20 +090012210/*[clinic input]
12211str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
INADA Naoki3ae20562017-01-16 20:41:20 +090012213 iterable: object
12214 /
12215
12216Concatenate any number of strings.
12217
Martin Panter91a88662017-01-24 00:30:06 +000012218The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012219The result is returned as a new string.
12220
12221Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12222[clinic start generated code]*/
12223
12224static PyObject *
12225unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012226/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227{
INADA Naoki3ae20562017-01-16 20:41:20 +090012228 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229}
12230
Martin v. Löwis18e16552006-02-15 17:27:45 +000012231static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012232unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 if (PyUnicode_READY(self) == -1)
12235 return -1;
12236 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237}
12238
INADA Naoki3ae20562017-01-16 20:41:20 +090012239/*[clinic input]
12240str.ljust as unicode_ljust
12241
12242 width: Py_ssize_t
12243 fillchar: Py_UCS4 = ' '
12244 /
12245
12246Return a left-justified string of length width.
12247
12248Padding is done using the specified fill character (default is a space).
12249[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250
12251static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012252unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12253/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012255 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257
Victor Stinnerc4b49542011-12-11 22:44:26 +010012258 if (PyUnicode_GET_LENGTH(self) >= width)
12259 return unicode_result_unchanged(self);
12260
12261 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262}
12263
INADA Naoki3ae20562017-01-16 20:41:20 +090012264/*[clinic input]
12265str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
INADA Naoki3ae20562017-01-16 20:41:20 +090012267Return a copy of the string converted to lowercase.
12268[clinic start generated code]*/
12269
12270static PyObject *
12271unicode_lower_impl(PyObject *self)
12272/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012274 if (PyUnicode_READY(self) == -1)
12275 return NULL;
12276 if (PyUnicode_IS_ASCII(self))
12277 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012278 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279}
12280
/* Strip-type selectors; they double as indexes into stripfuncnames. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  The table is never
   modified, so both the strings and the pointer array are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip type i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012289
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012290/* externally visible for str.strip(unicode) */
12291PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012292_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 void *data;
12295 int kind;
12296 Py_ssize_t i, j, len;
12297 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012298 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12301 return NULL;
12302
12303 kind = PyUnicode_KIND(self);
12304 data = PyUnicode_DATA(self);
12305 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012306 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12308 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012309 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012310
Benjamin Peterson14339b62009-01-31 16:36:08 +000012311 i = 0;
12312 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012313 while (i < len) {
12314 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12315 if (!BLOOM(sepmask, ch))
12316 break;
12317 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12318 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 i++;
12320 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012321 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012322
Benjamin Peterson14339b62009-01-31 16:36:08 +000012323 j = len;
12324 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012325 j--;
12326 while (j >= i) {
12327 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12328 if (!BLOOM(sepmask, ch))
12329 break;
12330 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12331 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012333 }
12334
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012336 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012337
Victor Stinner7931d9a2011-11-04 00:22:48 +010012338 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339}
12340
12341PyObject*
12342PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12343{
12344 unsigned char *data;
12345 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012346 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347
Victor Stinnerde636f32011-10-01 03:55:54 +020012348 if (PyUnicode_READY(self) == -1)
12349 return NULL;
12350
Victor Stinner684d5fd2012-05-03 02:32:34 +020012351 length = PyUnicode_GET_LENGTH(self);
12352 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012353
Victor Stinner684d5fd2012-05-03 02:32:34 +020012354 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012355 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356
Victor Stinnerde636f32011-10-01 03:55:54 +020012357 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012358 PyErr_SetString(PyExc_IndexError, "string index out of range");
12359 return NULL;
12360 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012361 if (start >= length || end < start)
12362 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012363
Victor Stinner684d5fd2012-05-03 02:32:34 +020012364 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012365 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012366 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012367 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012368 }
12369 else {
12370 kind = PyUnicode_KIND(self);
12371 data = PyUnicode_1BYTE_DATA(self);
12372 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012373 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012374 length);
12375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
12378static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012379do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 Py_ssize_t len, i, j;
12382
12383 if (PyUnicode_READY(self) == -1)
12384 return NULL;
12385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012387
Victor Stinnercc7af722013-04-09 22:39:24 +020012388 if (PyUnicode_IS_ASCII(self)) {
12389 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12390
12391 i = 0;
12392 if (striptype != RIGHTSTRIP) {
12393 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012394 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012395 if (!_Py_ascii_whitespace[ch])
12396 break;
12397 i++;
12398 }
12399 }
12400
12401 j = len;
12402 if (striptype != LEFTSTRIP) {
12403 j--;
12404 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012405 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012406 if (!_Py_ascii_whitespace[ch])
12407 break;
12408 j--;
12409 }
12410 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012411 }
12412 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012413 else {
12414 int kind = PyUnicode_KIND(self);
12415 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012416
Victor Stinnercc7af722013-04-09 22:39:24 +020012417 i = 0;
12418 if (striptype != RIGHTSTRIP) {
12419 while (i < len) {
12420 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12421 if (!Py_UNICODE_ISSPACE(ch))
12422 break;
12423 i++;
12424 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012425 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012426
12427 j = len;
12428 if (striptype != LEFTSTRIP) {
12429 j--;
12430 while (j >= i) {
12431 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12432 if (!Py_UNICODE_ISSPACE(ch))
12433 break;
12434 j--;
12435 }
12436 j++;
12437 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012438 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012439
Victor Stinner7931d9a2011-11-04 00:22:48 +010012440 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441}
12442
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012443
12444static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012445do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012446{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012447 if (sep != NULL && sep != Py_None) {
12448 if (PyUnicode_Check(sep))
12449 return _PyUnicode_XStrip(self, striptype, sep);
12450 else {
12451 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012452 "%s arg must be None or str",
12453 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012454 return NULL;
12455 }
12456 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012457
Benjamin Peterson14339b62009-01-31 16:36:08 +000012458 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012459}
12460
12461
INADA Naoki3ae20562017-01-16 20:41:20 +090012462/*[clinic input]
12463str.strip as unicode_strip
12464
12465 chars: object = None
12466 /
12467
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012469
12470If chars is given and not None, remove characters in chars instead.
12471[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012472
12473static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012474unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012475/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012476{
INADA Naoki3ae20562017-01-16 20:41:20 +090012477 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012478}
12479
12480
INADA Naoki3ae20562017-01-16 20:41:20 +090012481/*[clinic input]
12482str.lstrip as unicode_lstrip
12483
12484 chars: object = NULL
12485 /
12486
12487Return a copy of the string with leading whitespace removed.
12488
12489If chars is given and not None, remove characters in chars instead.
12490[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012491
12492static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012493unicode_lstrip_impl(PyObject *self, PyObject *chars)
12494/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012495{
INADA Naoki3ae20562017-01-16 20:41:20 +090012496 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012497}
12498
12499
INADA Naoki3ae20562017-01-16 20:41:20 +090012500/*[clinic input]
12501str.rstrip as unicode_rstrip
12502
12503 chars: object = NULL
12504 /
12505
12506Return a copy of the string with trailing whitespace removed.
12507
12508If chars is given and not None, remove characters in chars instead.
12509[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012510
12511static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012512unicode_rstrip_impl(PyObject *self, PyObject *chars)
12513/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012514{
INADA Naoki3ae20562017-01-16 20:41:20 +090012515 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012516}
12517
12518
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012520unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012522 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
Serhiy Storchaka05997252013-01-26 12:14:02 +020012525 if (len < 1)
12526 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527
Victor Stinnerc4b49542011-12-11 22:44:26 +010012528 /* no repeat, return original string */
12529 if (len == 1)
12530 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012531
Benjamin Petersonbac79492012-01-14 13:34:47 -050012532 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 return NULL;
12534
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012535 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012536 PyErr_SetString(PyExc_OverflowError,
12537 "repeated string is too long");
12538 return NULL;
12539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012541
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012542 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543 if (!u)
12544 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012545 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 if (PyUnicode_GET_LENGTH(str) == 1) {
12548 const int kind = PyUnicode_KIND(str);
12549 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012550 if (kind == PyUnicode_1BYTE_KIND) {
12551 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012552 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012553 }
12554 else if (kind == PyUnicode_2BYTE_KIND) {
12555 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012556 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012557 ucs2[n] = fill_char;
12558 } else {
12559 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12560 assert(kind == PyUnicode_4BYTE_KIND);
12561 for (n = 0; n < len; ++n)
12562 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 }
12565 else {
12566 /* number of characters copied this far */
12567 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012568 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012570 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012574 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012575 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577 }
12578
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012579 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012580 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581}
12582
Alexander Belopolsky40018472011-02-26 01:02:56 +000012583PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012584PyUnicode_Replace(PyObject *str,
12585 PyObject *substr,
12586 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012587 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012589 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12590 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012592 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593}
12594
INADA Naoki3ae20562017-01-16 20:41:20 +090012595/*[clinic input]
12596str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597
INADA Naoki3ae20562017-01-16 20:41:20 +090012598 old: unicode
12599 new: unicode
12600 count: Py_ssize_t = -1
12601 Maximum number of occurrences to replace.
12602 -1 (the default value) means replace all occurrences.
12603 /
12604
12605Return a copy with all occurrences of substring old replaced by new.
12606
12607If the optional argument count is given, only the first count occurrences are
12608replaced.
12609[clinic start generated code]*/
12610
12611static PyObject *
12612unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12613 Py_ssize_t count)
12614/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012616 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012618 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619}
12620
Alexander Belopolsky40018472011-02-26 01:02:56 +000012621static PyObject *
12622unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012624 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 Py_ssize_t isize;
12626 Py_ssize_t osize, squote, dquote, i, o;
12627 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012628 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012632 return NULL;
12633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 isize = PyUnicode_GET_LENGTH(unicode);
12635 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 /* Compute length of output, quote characters, and
12638 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012639 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 max = 127;
12641 squote = dquote = 0;
12642 ikind = PyUnicode_KIND(unicode);
12643 for (i = 0; i < isize; i++) {
12644 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012645 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012647 case '\'': squote++; break;
12648 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012650 incr = 2;
12651 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 default:
12653 /* Fast-path ASCII */
12654 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012655 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012657 ;
12658 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012661 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012663 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012665 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012667 if (osize > PY_SSIZE_T_MAX - incr) {
12668 PyErr_SetString(PyExc_OverflowError,
12669 "string is too long to generate repr");
12670 return NULL;
12671 }
12672 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 }
12674
12675 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012676 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012678 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 if (dquote)
12680 /* Both squote and dquote present. Use squote,
12681 and escape them */
12682 osize += squote;
12683 else
12684 quote = '"';
12685 }
Victor Stinner55c08782013-04-14 18:45:39 +020012686 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687
12688 repr = PyUnicode_New(osize, max);
12689 if (repr == NULL)
12690 return NULL;
12691 okind = PyUnicode_KIND(repr);
12692 odata = PyUnicode_DATA(repr);
12693
12694 PyUnicode_WRITE(okind, odata, 0, quote);
12695 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012696 if (unchanged) {
12697 _PyUnicode_FastCopyCharacters(repr, 1,
12698 unicode, 0,
12699 isize);
12700 }
12701 else {
12702 for (i = 0, o = 1; i < isize; i++) {
12703 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704
Victor Stinner55c08782013-04-14 18:45:39 +020012705 /* Escape quotes and backslashes */
12706 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012707 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012709 continue;
12710 }
12711
12712 /* Map special whitespace to '\t', \n', '\r' */
12713 if (ch == '\t') {
12714 PyUnicode_WRITE(okind, odata, o++, '\\');
12715 PyUnicode_WRITE(okind, odata, o++, 't');
12716 }
12717 else if (ch == '\n') {
12718 PyUnicode_WRITE(okind, odata, o++, '\\');
12719 PyUnicode_WRITE(okind, odata, o++, 'n');
12720 }
12721 else if (ch == '\r') {
12722 PyUnicode_WRITE(okind, odata, o++, '\\');
12723 PyUnicode_WRITE(okind, odata, o++, 'r');
12724 }
12725
12726 /* Map non-printable US ASCII to '\xhh' */
12727 else if (ch < ' ' || ch == 0x7F) {
12728 PyUnicode_WRITE(okind, odata, o++, '\\');
12729 PyUnicode_WRITE(okind, odata, o++, 'x');
12730 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12731 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12732 }
12733
12734 /* Copy ASCII characters as-is */
12735 else if (ch < 0x7F) {
12736 PyUnicode_WRITE(okind, odata, o++, ch);
12737 }
12738
12739 /* Non-ASCII characters */
12740 else {
12741 /* Map Unicode whitespace and control characters
12742 (categories Z* and C* except ASCII space)
12743 */
12744 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12745 PyUnicode_WRITE(okind, odata, o++, '\\');
12746 /* Map 8-bit characters to '\xhh' */
12747 if (ch <= 0xff) {
12748 PyUnicode_WRITE(okind, odata, o++, 'x');
12749 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12750 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12751 }
12752 /* Map 16-bit characters to '\uxxxx' */
12753 else if (ch <= 0xffff) {
12754 PyUnicode_WRITE(okind, odata, o++, 'u');
12755 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12756 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12757 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12758 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12759 }
12760 /* Map 21-bit characters to '\U00xxxxxx' */
12761 else {
12762 PyUnicode_WRITE(okind, odata, o++, 'U');
12763 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12764 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12765 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12771 }
12772 }
12773 /* Copy characters as-is */
12774 else {
12775 PyUnicode_WRITE(okind, odata, o++, ch);
12776 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012777 }
12778 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012781 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012782 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783}
12784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012785PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012786 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787\n\
12788Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012789such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790arguments start and end are interpreted as in slice notation.\n\
12791\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012792Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793
12794static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012797 /* initialize variables to prevent gcc warning */
12798 PyObject *substring = NULL;
12799 Py_ssize_t start = 0;
12800 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012803 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012806 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012809 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 if (result == -2)
12812 return NULL;
12813
Christian Heimes217cfd12007-12-02 14:31:20 +000012814 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815}
12816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012817PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012820Return the highest index in S where substring sub is found,\n\
12821such that sub is contained within S[start:end]. Optional\n\
12822arguments start and end are interpreted as in slice notation.\n\
12823\n\
12824Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825
12826static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012829 /* initialize variables to prevent gcc warning */
12830 PyObject *substring = NULL;
12831 Py_ssize_t start = 0;
12832 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012833 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012835 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012838 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012841 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843 if (result == -2)
12844 return NULL;
12845
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846 if (result < 0) {
12847 PyErr_SetString(PyExc_ValueError, "substring not found");
12848 return NULL;
12849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850
Christian Heimes217cfd12007-12-02 14:31:20 +000012851 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852}
12853
INADA Naoki3ae20562017-01-16 20:41:20 +090012854/*[clinic input]
12855str.rjust as unicode_rjust
12856
12857 width: Py_ssize_t
12858 fillchar: Py_UCS4 = ' '
12859 /
12860
12861Return a right-justified string of length width.
12862
12863Padding is done using the specified fill character (default is a space).
12864[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865
12866static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012867unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12868/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012870 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871 return NULL;
12872
Victor Stinnerc4b49542011-12-11 22:44:26 +010012873 if (PyUnicode_GET_LENGTH(self) >= width)
12874 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875
Victor Stinnerc4b49542011-12-11 22:44:26 +010012876 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877}
12878
Alexander Belopolsky40018472011-02-26 01:02:56 +000012879PyObject *
12880PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012882 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012885 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886}
12887
INADA Naoki3ae20562017-01-16 20:41:20 +090012888/*[clinic input]
12889str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890
INADA Naoki3ae20562017-01-16 20:41:20 +090012891 sep: object = None
        The delimiter according to which to split the string.
12893 None (the default value) means split according to any whitespace,
12894 and discard empty strings from the result.
12895 maxsplit: Py_ssize_t = -1
12896 Maximum number of splits to do.
12897 -1 (the default value) means no limit.
12898
12899Return a list of the words in the string, using sep as the delimiter string.
12900[clinic start generated code]*/
12901
12902static PyObject *
12903unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12904/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905{
INADA Naoki3ae20562017-01-16 20:41:20 +090012906 if (sep == Py_None)
12907 return split(self, NULL, maxsplit);
12908 if (PyUnicode_Check(sep))
12909 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012910
12911 PyErr_Format(PyExc_TypeError,
12912 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012913 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915}
12916
Thomas Wouters477c8d52006-05-27 19:21:47 +000012917PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012918PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012919{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012921 int kind1, kind2;
12922 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012925 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927
Victor Stinner14f8f022011-10-05 20:58:25 +020012928 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 len1 = PyUnicode_GET_LENGTH(str_obj);
12931 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 if (kind1 < kind2 || len1 < len2) {
12933 _Py_INCREF_UNICODE_EMPTY();
12934 if (!unicode_empty)
12935 out = NULL;
12936 else {
12937 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12938 Py_DECREF(unicode_empty);
12939 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012940 return out;
12941 }
12942 buf1 = PyUnicode_DATA(str_obj);
12943 buf2 = PyUnicode_DATA(sep_obj);
12944 if (kind2 != kind1) {
12945 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12946 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012947 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012950 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012952 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12953 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12954 else
12955 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 break;
12957 case PyUnicode_2BYTE_KIND:
12958 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12959 break;
12960 case PyUnicode_4BYTE_KIND:
12961 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12962 break;
12963 default:
12964 assert(0);
12965 out = 0;
12966 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012967
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012968 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012970
12971 return out;
12972}
12973
12974
12975PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012976PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012977{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012978 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012979 int kind1, kind2;
12980 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012982
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012983 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012984 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012986 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 len1 = PyUnicode_GET_LENGTH(str_obj);
12989 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012990 if (kind1 < kind2 || len1 < len2) {
12991 _Py_INCREF_UNICODE_EMPTY();
12992 if (!unicode_empty)
12993 out = NULL;
12994 else {
12995 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12996 Py_DECREF(unicode_empty);
12997 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012998 return out;
12999 }
13000 buf1 = PyUnicode_DATA(str_obj);
13001 buf2 = PyUnicode_DATA(sep_obj);
13002 if (kind2 != kind1) {
13003 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13004 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013005 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013010 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13011 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13012 else
13013 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 break;
13015 case PyUnicode_2BYTE_KIND:
13016 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13017 break;
13018 case PyUnicode_4BYTE_KIND:
13019 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13020 break;
13021 default:
13022 assert(0);
13023 out = 0;
13024 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013025
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013026 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013028
13029 return out;
13030}
13031
INADA Naoki3ae20562017-01-16 20:41:20 +090013032/*[clinic input]
13033str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013034
INADA Naoki3ae20562017-01-16 20:41:20 +090013035 sep: object
13036 /
13037
13038Partition the string into three parts using the given separator.
13039
13040This will search for the separator in the string. If the separator is found,
13041returns a 3-tuple containing the part before the separator, the separator
13042itself, and the part after it.
13043
13044If the separator is not found, returns a 3-tuple containing the original string
13045and two empty strings.
13046[clinic start generated code]*/
13047
13048static PyObject *
13049unicode_partition(PyObject *self, PyObject *sep)
13050/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013051{
INADA Naoki3ae20562017-01-16 20:41:20 +090013052 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053}
13054
INADA Naoki3ae20562017-01-16 20:41:20 +090013055/*[clinic input]
13056str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013057
INADA Naoki3ae20562017-01-16 20:41:20 +090013058Partition the string into three parts using the given separator.
13059
This will search for the separator in the string, starting at the end. If
13061the separator is found, returns a 3-tuple containing the part before the
13062separator, the separator itself, and the part after it.
13063
13064If the separator is not found, returns a 3-tuple containing two empty strings
13065and the original string.
13066[clinic start generated code]*/
13067
13068static PyObject *
13069unicode_rpartition(PyObject *self, PyObject *sep)
13070/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013071{
INADA Naoki3ae20562017-01-16 20:41:20 +090013072 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013073}
13074
Alexander Belopolsky40018472011-02-26 01:02:56 +000013075PyObject *
13076PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013077{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013078 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013079 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013080
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013081 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013082}
13083
INADA Naoki3ae20562017-01-16 20:41:20 +090013084/*[clinic input]
13085str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013086
INADA Naoki3ae20562017-01-16 20:41:20 +090013087Return a list of the words in the string, using sep as the delimiter string.
13088
13089Splits are done starting at the end of the string and working to the front.
13090[clinic start generated code]*/
13091
13092static PyObject *
13093unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13094/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013095{
INADA Naoki3ae20562017-01-16 20:41:20 +090013096 if (sep == Py_None)
13097 return rsplit(self, NULL, maxsplit);
13098 if (PyUnicode_Check(sep))
13099 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013100
13101 PyErr_Format(PyExc_TypeError,
13102 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013103 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013104 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013105}
13106
INADA Naoki3ae20562017-01-16 20:41:20 +090013107/*[clinic input]
13108str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013110 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013111
13112Return a list of the lines in the string, breaking at line boundaries.
13113
13114Line breaks are not included in the resulting list unless keepends is given and
13115true.
13116[clinic start generated code]*/
13117
13118static PyObject *
13119unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013120/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013122 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123}
13124
13125static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013126PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013128 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129}
13130
INADA Naoki3ae20562017-01-16 20:41:20 +090013131/*[clinic input]
13132str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133
INADA Naoki3ae20562017-01-16 20:41:20 +090013134Convert uppercase characters to lowercase and lowercase characters to uppercase.
13135[clinic start generated code]*/
13136
13137static PyObject *
13138unicode_swapcase_impl(PyObject *self)
13139/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013141 if (PyUnicode_READY(self) == -1)
13142 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013143 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144}
13145
Larry Hastings61272b72014-01-07 12:41:53 -080013146/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013147
Larry Hastings31826802013-10-19 00:09:25 -070013148@staticmethod
13149str.maketrans as unicode_maketrans
13150
13151 x: object
13152
13153 y: unicode=NULL
13154
13155 z: unicode=NULL
13156
13157 /
13158
13159Return a translation table usable for str.translate().
13160
13161If there is only one argument, it must be a dictionary mapping Unicode
13162ordinals (integers) or characters to Unicode ordinals, strings or None.
13163Character keys will be then converted to ordinals.
13164If there are two arguments, they must be strings of equal length, and
13165in the resulting dictionary, each character in x will be mapped to the
13166character at the same position in y. If there is a third argument, it
13167must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013168[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013169
Larry Hastings31826802013-10-19 00:09:25 -070013170static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013171unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013172/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013173{
Georg Brandlceee0772007-11-27 23:48:05 +000013174 PyObject *new = NULL, *key, *value;
13175 Py_ssize_t i = 0;
13176 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013177
Georg Brandlceee0772007-11-27 23:48:05 +000013178 new = PyDict_New();
13179 if (!new)
13180 return NULL;
13181 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013182 int x_kind, y_kind, z_kind;
13183 void *x_data, *y_data, *z_data;
13184
Georg Brandlceee0772007-11-27 23:48:05 +000013185 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013186 if (!PyUnicode_Check(x)) {
13187 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13188 "be a string if there is a second argument");
13189 goto err;
13190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013192 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13193 "arguments must have equal length");
13194 goto err;
13195 }
13196 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013197 x_kind = PyUnicode_KIND(x);
13198 y_kind = PyUnicode_KIND(y);
13199 x_data = PyUnicode_DATA(x);
13200 y_data = PyUnicode_DATA(y);
13201 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13202 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013203 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013204 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013205 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013206 if (!value) {
13207 Py_DECREF(key);
13208 goto err;
13209 }
Georg Brandlceee0772007-11-27 23:48:05 +000013210 res = PyDict_SetItem(new, key, value);
13211 Py_DECREF(key);
13212 Py_DECREF(value);
13213 if (res < 0)
13214 goto err;
13215 }
13216 /* create entries for deleting chars in z */
13217 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013218 z_kind = PyUnicode_KIND(z);
13219 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013220 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013221 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013222 if (!key)
13223 goto err;
13224 res = PyDict_SetItem(new, key, Py_None);
13225 Py_DECREF(key);
13226 if (res < 0)
13227 goto err;
13228 }
13229 }
13230 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231 int kind;
13232 void *data;
13233
Georg Brandlceee0772007-11-27 23:48:05 +000013234 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013235 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013236 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13237 "to maketrans it must be a dict");
13238 goto err;
13239 }
13240 /* copy entries into the new dict, converting string keys to int keys */
13241 while (PyDict_Next(x, &i, &key, &value)) {
13242 if (PyUnicode_Check(key)) {
13243 /* convert string keys to integer keys */
13244 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013245 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013246 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13247 "table must be of length 1");
13248 goto err;
13249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 kind = PyUnicode_KIND(key);
13251 data = PyUnicode_DATA(key);
13252 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013253 if (!newkey)
13254 goto err;
13255 res = PyDict_SetItem(new, newkey, value);
13256 Py_DECREF(newkey);
13257 if (res < 0)
13258 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013259 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013260 /* just keep integer keys */
13261 if (PyDict_SetItem(new, key, value) < 0)
13262 goto err;
13263 } else {
13264 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13265 "be strings or integers");
13266 goto err;
13267 }
13268 }
13269 }
13270 return new;
13271 err:
13272 Py_DECREF(new);
13273 return NULL;
13274}
13275
INADA Naoki3ae20562017-01-16 20:41:20 +090013276/*[clinic input]
13277str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278
INADA Naoki3ae20562017-01-16 20:41:20 +090013279 table: object
13280 Translation table, which must be a mapping of Unicode ordinals to
13281 Unicode ordinals, strings, or None.
13282 /
13283
13284Replace each character in the string using the given translation table.
13285
13286The table must implement lookup/indexing via __getitem__, for instance a
13287dictionary or list. If this operation raises LookupError, the character is
13288left untouched. Characters mapped to None are deleted.
13289[clinic start generated code]*/
13290
13291static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013293/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296}
13297
INADA Naoki3ae20562017-01-16 20:41:20 +090013298/*[clinic input]
13299str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300
INADA Naoki3ae20562017-01-16 20:41:20 +090013301Return a copy of the string converted to uppercase.
13302[clinic start generated code]*/
13303
13304static PyObject *
13305unicode_upper_impl(PyObject *self)
13306/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013308 if (PyUnicode_READY(self) == -1)
13309 return NULL;
13310 if (PyUnicode_IS_ASCII(self))
13311 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013312 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313}
13314
INADA Naoki3ae20562017-01-16 20:41:20 +090013315/*[clinic input]
13316str.zfill as unicode_zfill
13317
13318 width: Py_ssize_t
13319 /
13320
13321Pad a numeric string with zeros on the left, to fill a field of the given width.
13322
13323The string is never truncated.
13324[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013325
13326static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013327unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013328/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013329{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013330 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013331 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332 int kind;
13333 void *data;
13334 Py_UCS4 chr;
13335
Benjamin Petersonbac79492012-01-14 13:34:47 -050013336 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338
Victor Stinnerc4b49542011-12-11 22:44:26 +010013339 if (PyUnicode_GET_LENGTH(self) >= width)
13340 return unicode_result_unchanged(self);
13341
13342 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343
13344 u = pad(self, fill, 0, '0');
13345
Walter Dörwald068325e2002-04-15 13:36:47 +000013346 if (u == NULL)
13347 return NULL;
13348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013349 kind = PyUnicode_KIND(u);
13350 data = PyUnicode_DATA(u);
13351 chr = PyUnicode_READ(kind, data, fill);
13352
13353 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013354 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 PyUnicode_WRITE(kind, data, 0, chr);
13356 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357 }
13358
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013359 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013360 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362
#if 0
/* Compiled out by the enclosing #if 0.  Wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013371PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013374Return True if S starts with the specified prefix, False otherwise.\n\
13375With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013376With optional end, stop comparing S at that position.\n\
13377prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378
13379static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013380unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013381 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013382{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013383 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013384 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013385 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013386 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013387 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388
Jesus Ceaac451502011-04-20 17:09:23 +020013389 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013391 if (PyTuple_Check(subobj)) {
13392 Py_ssize_t i;
13393 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013394 substring = PyTuple_GET_ITEM(subobj, i);
13395 if (!PyUnicode_Check(substring)) {
13396 PyErr_Format(PyExc_TypeError,
13397 "tuple for startswith must only contain str, "
13398 "not %.100s",
13399 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013400 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013401 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013402 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013403 if (result == -1)
13404 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013405 if (result) {
13406 Py_RETURN_TRUE;
13407 }
13408 }
13409 /* nothing matched */
13410 Py_RETURN_FALSE;
13411 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013412 if (!PyUnicode_Check(subobj)) {
13413 PyErr_Format(PyExc_TypeError,
13414 "startswith first arg must be str or "
13415 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013417 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013418 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013419 if (result == -1)
13420 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013421 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422}
13423
13424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013425PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013428Return True if S ends with the specified suffix, False otherwise.\n\
13429With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013430With optional end, stop comparing S at that position.\n\
13431suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432
13433static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013434unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013435 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013437 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013438 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013439 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013440 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013441 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442
Jesus Ceaac451502011-04-20 17:09:23 +020013443 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013445 if (PyTuple_Check(subobj)) {
13446 Py_ssize_t i;
13447 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013448 substring = PyTuple_GET_ITEM(subobj, i);
13449 if (!PyUnicode_Check(substring)) {
13450 PyErr_Format(PyExc_TypeError,
13451 "tuple for endswith must only contain str, "
13452 "not %.100s",
13453 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013455 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013456 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013457 if (result == -1)
13458 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013459 if (result) {
13460 Py_RETURN_TRUE;
13461 }
13462 }
13463 Py_RETURN_FALSE;
13464 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013465 if (!PyUnicode_Check(subobj)) {
13466 PyErr_Format(PyExc_TypeError,
13467 "endswith first arg must be str or "
13468 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013470 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013471 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013472 if (result == -1)
13473 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013474 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013475}
13476
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013477static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013478_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013479{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013480 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13481 writer->data = PyUnicode_DATA(writer->buffer);
13482
13483 if (!writer->readonly) {
13484 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013485 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013486 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013487 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013488 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13489 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13490 writer->kind = PyUnicode_WCHAR_KIND;
13491 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13492
Victor Stinner8f674cc2013-04-17 23:02:17 +020013493 /* Copy-on-write mode: set buffer size to 0 so
13494 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13495 * next write. */
13496 writer->size = 0;
13497 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013498}
13499
Victor Stinnerd3f08822012-05-29 12:57:52 +020013500void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013501_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013502{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013504
13505 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013506 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013507
13508 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13509 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13510 writer->kind = PyUnicode_WCHAR_KIND;
13511 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013512}
13513
Victor Stinnerd3f08822012-05-29 12:57:52 +020013514int
13515_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13516 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013517{
13518 Py_ssize_t newlen;
13519 PyObject *newbuffer;
13520
Victor Stinner2740e462016-09-06 16:58:36 -070013521 assert(maxchar <= MAX_UNICODE);
13522
Victor Stinnerca9381e2015-09-22 00:58:32 +020013523 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013524 assert((maxchar > writer->maxchar && length >= 0)
13525 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013526
Victor Stinner202fdca2012-05-07 12:47:02 +020013527 if (length > PY_SSIZE_T_MAX - writer->pos) {
13528 PyErr_NoMemory();
13529 return -1;
13530 }
13531 newlen = writer->pos + length;
13532
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013533 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013534
Victor Stinnerd3f08822012-05-29 12:57:52 +020013535 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013536 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013537 if (writer->overallocate
13538 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13539 /* overallocate to limit the number of realloc() */
13540 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013541 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013542 if (newlen < writer->min_length)
13543 newlen = writer->min_length;
13544
Victor Stinnerd3f08822012-05-29 12:57:52 +020013545 writer->buffer = PyUnicode_New(newlen, maxchar);
13546 if (writer->buffer == NULL)
13547 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013548 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013549 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013550 if (writer->overallocate
13551 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13552 /* overallocate to limit the number of realloc() */
13553 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013554 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013555 if (newlen < writer->min_length)
13556 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013558 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013559 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013560 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013561 newbuffer = PyUnicode_New(newlen, maxchar);
13562 if (newbuffer == NULL)
13563 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013564 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13565 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013566 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013567 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013568 }
13569 else {
13570 newbuffer = resize_compact(writer->buffer, newlen);
13571 if (newbuffer == NULL)
13572 return -1;
13573 }
13574 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013575 }
13576 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013577 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013578 newbuffer = PyUnicode_New(writer->size, maxchar);
13579 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013580 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013581 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13582 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013583 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013584 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013585 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013586 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013587
13588#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013589}
13590
Victor Stinnerca9381e2015-09-22 00:58:32 +020013591int
13592_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13593 enum PyUnicode_Kind kind)
13594{
13595 Py_UCS4 maxchar;
13596
13597 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13598 assert(writer->kind < kind);
13599
13600 switch (kind)
13601 {
13602 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13603 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13604 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13605 default:
13606 assert(0 && "invalid kind");
13607 return -1;
13608 }
13609
13610 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13611}
13612
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013613static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013614_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013615{
Victor Stinner2740e462016-09-06 16:58:36 -070013616 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013617 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13618 return -1;
13619 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13620 writer->pos++;
13621 return 0;
13622}
13623
13624int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013625_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13626{
13627 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13628}
13629
13630int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013631_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13632{
13633 Py_UCS4 maxchar;
13634 Py_ssize_t len;
13635
13636 if (PyUnicode_READY(str) == -1)
13637 return -1;
13638 len = PyUnicode_GET_LENGTH(str);
13639 if (len == 0)
13640 return 0;
13641 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13642 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013643 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013644 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013645 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013646 Py_INCREF(str);
13647 writer->buffer = str;
13648 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013649 writer->pos += len;
13650 return 0;
13651 }
13652 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13653 return -1;
13654 }
13655 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13656 str, 0, len);
13657 writer->pos += len;
13658 return 0;
13659}
13660
Victor Stinnere215d962012-10-06 23:03:36 +020013661int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013662_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13663 Py_ssize_t start, Py_ssize_t end)
13664{
13665 Py_UCS4 maxchar;
13666 Py_ssize_t len;
13667
13668 if (PyUnicode_READY(str) == -1)
13669 return -1;
13670
13671 assert(0 <= start);
13672 assert(end <= PyUnicode_GET_LENGTH(str));
13673 assert(start <= end);
13674
13675 if (end == 0)
13676 return 0;
13677
13678 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13679 return _PyUnicodeWriter_WriteStr(writer, str);
13680
13681 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13682 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13683 else
13684 maxchar = writer->maxchar;
13685 len = end - start;
13686
13687 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13688 return -1;
13689
13690 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13691 str, start, len);
13692 writer->pos += len;
13693 return 0;
13694}
13695
13696int
Victor Stinner4a587072013-11-19 12:54:53 +010013697_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13698 const char *ascii, Py_ssize_t len)
13699{
13700 if (len == -1)
13701 len = strlen(ascii);
13702
13703 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13704
13705 if (writer->buffer == NULL && !writer->overallocate) {
13706 PyObject *str;
13707
13708 str = _PyUnicode_FromASCII(ascii, len);
13709 if (str == NULL)
13710 return -1;
13711
13712 writer->readonly = 1;
13713 writer->buffer = str;
13714 _PyUnicodeWriter_Update(writer);
13715 writer->pos += len;
13716 return 0;
13717 }
13718
13719 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13720 return -1;
13721
13722 switch (writer->kind)
13723 {
13724 case PyUnicode_1BYTE_KIND:
13725 {
13726 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13727 Py_UCS1 *data = writer->data;
13728
Christian Heimesf051e432016-09-13 20:22:02 +020013729 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013730 break;
13731 }
13732 case PyUnicode_2BYTE_KIND:
13733 {
13734 _PyUnicode_CONVERT_BYTES(
13735 Py_UCS1, Py_UCS2,
13736 ascii, ascii + len,
13737 (Py_UCS2 *)writer->data + writer->pos);
13738 break;
13739 }
13740 case PyUnicode_4BYTE_KIND:
13741 {
13742 _PyUnicode_CONVERT_BYTES(
13743 Py_UCS1, Py_UCS4,
13744 ascii, ascii + len,
13745 (Py_UCS4 *)writer->data + writer->pos);
13746 break;
13747 }
13748 default:
13749 assert(0);
13750 }
13751
13752 writer->pos += len;
13753 return 0;
13754}
13755
13756int
13757_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13758 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013759{
13760 Py_UCS4 maxchar;
13761
13762 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13763 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13764 return -1;
13765 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13766 writer->pos += len;
13767 return 0;
13768}
13769
Victor Stinnerd3f08822012-05-29 12:57:52 +020013770PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013771_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013772{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013773 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013774
Victor Stinnerd3f08822012-05-29 12:57:52 +020013775 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013776 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013777 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013778 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013779
13780 str = writer->buffer;
13781 writer->buffer = NULL;
13782
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013783 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013784 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13785 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013786 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013787
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013788 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13789 PyObject *str2;
13790 str2 = resize_compact(str, writer->pos);
13791 if (str2 == NULL) {
13792 Py_DECREF(str);
13793 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013794 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013795 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013796 }
13797
Victor Stinner15a0bd32013-07-08 22:29:55 +020013798 assert(_PyUnicode_CheckConsistency(str, 1));
13799 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013800}
13801
Victor Stinnerd3f08822012-05-29 12:57:52 +020013802void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013803_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013804{
13805 Py_CLEAR(writer->buffer);
13806}
13807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013809
/* Docstrings for str.format() and str.format_map(); both are referenced
   from the unicode_methods table. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013821
INADA Naoki3ae20562017-01-16 20:41:20 +090013822/*[clinic input]
13823str.__format__ as unicode___format__
13824
13825 format_spec: unicode
13826 /
13827
13828Return a formatted version of the string as described by format_spec.
13829[clinic start generated code]*/
13830
Eric Smith4a7d76d2008-05-30 18:10:19 +000013831static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013832unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013833/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013834{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013835 _PyUnicodeWriter writer;
13836 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013837
Victor Stinnerd3f08822012-05-29 12:57:52 +020013838 if (PyUnicode_READY(self) == -1)
13839 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013840 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13842 self, format_spec, 0,
13843 PyUnicode_GET_LENGTH(format_spec));
13844 if (ret == -1) {
13845 _PyUnicodeWriter_Dealloc(&writer);
13846 return NULL;
13847 }
13848 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013849}
13850
INADA Naoki3ae20562017-01-16 20:41:20 +090013851/*[clinic input]
13852str.__sizeof__ as unicode_sizeof
13853
13854Return the size of the string in memory, in bytes.
13855[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013856
13857static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013858unicode_sizeof_impl(PyObject *self)
13859/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013861 Py_ssize_t size;
13862
13863 /* If it's a compact object, account for base structure +
13864 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013865 if (PyUnicode_IS_COMPACT_ASCII(self))
13866 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13867 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013868 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013869 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013870 else {
13871 /* If it is a two-block object, account for base object, and
13872 for character block if present. */
13873 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013874 if (_PyUnicode_DATA_ANY(self))
13875 size += (PyUnicode_GET_LENGTH(self) + 1) *
13876 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013877 }
13878 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013879 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013880 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13881 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13882 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13883 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013884
13885 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013886}
13887
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013888static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013889unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013890{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013891 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892 if (!copy)
13893 return NULL;
13894 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013895}
13896
/* Method table of the str type.

   The UNICODE_*_METHODDEF entries are macros defined outside this part of
   the file (presumably generated by Argument Clinic -- confirm); the
   brace-initialized entries are written by hand.  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() is the only hand-written entry that accepts keyword
       arguments; format_map() takes exactly one object. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13952
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013953static PyObject *
13954unicode_mod(PyObject *v, PyObject *w)
13955{
Brian Curtindfc80e32011-08-10 20:28:54 -050013956 if (!PyUnicode_Check(v))
13957 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013958 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013959}
13960
/* Number protocol of str: only nb_remainder (the % operator) is provided.
   The initializer stops after that slot, so every later slot is
   zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13967
/* Sequence protocol of str: length, concatenation, repetition, item access
   and containment.  The slice and assignment slots are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13978
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013979static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013980unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013982 if (PyUnicode_READY(self) == -1)
13983 return NULL;
13984
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013985 if (PyIndex_Check(item)) {
13986 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013987 if (i == -1 && PyErr_Occurred())
13988 return NULL;
13989 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013990 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013991 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013992 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013993 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013994 PyObject *result;
13995 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013996 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013997 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013999 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014001 return NULL;
14002 }
14003
14004 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014005 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014006 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014007 slicelength == PyUnicode_GET_LENGTH(self)) {
14008 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014009 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014010 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014011 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014012 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014013 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014014 src_kind = PyUnicode_KIND(self);
14015 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014016 if (!PyUnicode_IS_ASCII(self)) {
14017 kind_limit = kind_maxchar_limit(src_kind);
14018 max_char = 0;
14019 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14020 ch = PyUnicode_READ(src_kind, src_data, cur);
14021 if (ch > max_char) {
14022 max_char = ch;
14023 if (max_char >= kind_limit)
14024 break;
14025 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014026 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014027 }
Victor Stinner55c99112011-10-13 01:17:06 +020014028 else
14029 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014030 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014031 if (result == NULL)
14032 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014033 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014034 dest_data = PyUnicode_DATA(result);
14035
14036 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014037 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14038 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014039 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014040 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014041 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014042 } else {
14043 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14044 return NULL;
14045 }
14046}
14047
/* Mapping protocol of str: length and subscripting (index or slice).
   There is no item assignment slot. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
14053
Guido van Rossumd57fd912000-03-10 22:53:23 +000014054
Guido van Rossumd57fd912000-03-10 22:53:23 +000014055/* Helpers for PyUnicode_Format() */
14056
/* State shared by the helpers of PyUnicode_Format(). */
struct unicode_formatter_t {
    /* Argument source: unicode_format_getnextarg() returns it as is when
       arglen is negative, otherwise indexes it with PyTuple_GetItem(). */
    PyObject *args;
    /* NOTE(review): presumably nonzero when this struct owns a reference to
       args -- confirm against PyUnicode_Format(). */
    int args_owned;
    /* arglen: argument count (negative: args is a single non-tuple value).
       argidx: index of the next argument handed out by
       unicode_format_getnextarg(). */
    Py_ssize_t arglen, argidx;
    /* NOTE(review): presumably the mapping used by "%(name)s" lookups --
       confirm. */
    PyObject *dict;

    /* fmtkind, fmtdata and fmtpos are what FORMAT_READ() passes to
       PyUnicode_READ() to fetch the current format character. */
    enum PyUnicode_Kind fmtkind;
    /* NOTE(review): fmtcnt is presumably the number of format characters
       left to process -- confirm. */
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    /* NOTE(review): presumably the format string object -- confirm. */
    PyObject *fmtstr;

    /* Writer used to build the output. */
    _PyUnicodeWriter writer;
};
14070
/* Options of one format argument (see unicode_format_arg_parse()). */
struct unicode_format_arg_t {
    /* Format character being processed; formatfloat() and mainformatlong()
       use it as the conversion code. */
    Py_UCS4 ch;
    /* Bitmask; the helpers test it against F_ALT, F_SIGN and F_BLANK. */
    int flags;
    /* mainformatlong() compares it with -1 for its fast path; presumably
       -1 means that no width was given -- confirm. */
    Py_ssize_t width;
    /* Precision; formatfloat() uses 6 when it is negative, and
       mainformatlong() compares it with -1 for its fast path. */
    int prec;
    /* NOTE(review): not used by the helpers visible here; presumably tells
       the caller whether sign handling applies -- confirm. */
    int sign;
};
14078
Guido van Rossumd57fd912000-03-10 22:53:23 +000014079static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014080unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014081{
Victor Stinnera47082312012-10-04 02:19:54 +020014082 Py_ssize_t argidx = ctx->argidx;
14083
14084 if (argidx < ctx->arglen) {
14085 ctx->argidx++;
14086 if (ctx->arglen < 0)
14087 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014088 else
Victor Stinnera47082312012-10-04 02:19:54 +020014089 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014090 }
14091 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014092 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014093 return NULL;
14094}
14095
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014096/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014097
Victor Stinnera47082312012-10-04 02:19:54 +020014098/* Format a float into the writer if the writer is not NULL, or into *p_output
14099 otherwise.
14100
14101 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014102static int
Victor Stinnera47082312012-10-04 02:19:54 +020014103formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14104 PyObject **p_output,
14105 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014106{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014107 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014108 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014109 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014110 int prec;
14111 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014112
Guido van Rossumd57fd912000-03-10 22:53:23 +000014113 x = PyFloat_AsDouble(v);
14114 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014115 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014116
Victor Stinnera47082312012-10-04 02:19:54 +020014117 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014118 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014119 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014120
Victor Stinnera47082312012-10-04 02:19:54 +020014121 if (arg->flags & F_ALT)
14122 dtoa_flags = Py_DTSF_ALT;
14123 else
14124 dtoa_flags = 0;
14125 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014126 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014127 return -1;
14128 len = strlen(p);
14129 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014130 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014131 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014132 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014133 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014134 }
14135 else
14136 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014137 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014138 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014139}
14140
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the "alternate form" flag, for Python ints.
 * Return value: a new reference to a str object, or NULL if error.
 * The output string is of the form
 *      "-"? ("0o" | "0x" | "0X")? digit+
 * The base prefix is present only for o, x and X conversions when alt is
 * nonzero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *      val     object to be converted; must be an int (asserted)
 *      alt     nonzero to keep the base prefix (F_ALT at the caller)
 *      prec    minimum number of digits; 0-fill on left if needed
 *      type    a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  a leading '-' is handled for every conversion, including
 *           o, x and X.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits is at most 3 (sign plus a 2-character prefix), so this
       keeps numnondigits + prec within the range of int. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Convert to text in the requested base.  For bases 8 and 16 the
       result starts with a 2-character base prefix, counted in
       numnondigits. */
    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The remaining arithmetic is done in int. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        /* Skip the prefix by advancing buf; when the number is negative,
           rewrite the '-' just in front of the digits.  From here on buf
           no longer points at the start of result's data. */
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object: sign/prefix,
           then the zero padding, then the digits.  result then refers to
           that bytes object instead of the str. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* Produce the final str: a fresh one when the text lives in the bytes
       object or no longer starts at result's data (prefix skipped);
       otherwise resize in place if the length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14286
Ethan Furmandf3ed242014-01-05 06:50:30 -080014287/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014288 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014289 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014290 * -1 and raise an exception on error */
14291static int
Victor Stinnera47082312012-10-04 02:19:54 +020014292mainformatlong(PyObject *v,
14293 struct unicode_format_arg_t *arg,
14294 PyObject **p_output,
14295 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014296{
14297 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014298 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014299
14300 if (!PyNumber_Check(v))
14301 goto wrongtype;
14302
Ethan Furman9ab74802014-03-21 06:38:46 -070014303 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014304 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014305 if (type == 'o' || type == 'x' || type == 'X') {
14306 iobj = PyNumber_Index(v);
14307 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014308 if (PyErr_ExceptionMatches(PyExc_TypeError))
14309 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014310 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014311 }
14312 }
14313 else {
14314 iobj = PyNumber_Long(v);
14315 if (iobj == NULL ) {
14316 if (PyErr_ExceptionMatches(PyExc_TypeError))
14317 goto wrongtype;
14318 return -1;
14319 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014320 }
14321 assert(PyLong_Check(iobj));
14322 }
14323 else {
14324 iobj = v;
14325 Py_INCREF(iobj);
14326 }
14327
14328 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014329 && arg->width == -1 && arg->prec == -1
14330 && !(arg->flags & (F_SIGN | F_BLANK))
14331 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014332 {
14333 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014334 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014335 int base;
14336
Victor Stinnera47082312012-10-04 02:19:54 +020014337 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014338 {
14339 default:
14340 assert(0 && "'type' not in [diuoxX]");
14341 case 'd':
14342 case 'i':
14343 case 'u':
14344 base = 10;
14345 break;
14346 case 'o':
14347 base = 8;
14348 break;
14349 case 'x':
14350 case 'X':
14351 base = 16;
14352 break;
14353 }
14354
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014355 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14356 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014357 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014358 }
14359 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014360 return 1;
14361 }
14362
Ethan Furmanb95b5612015-01-23 20:05:18 -080014363 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014364 Py_DECREF(iobj);
14365 if (res == NULL)
14366 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014367 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014368 return 0;
14369
14370wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014371 switch(type)
14372 {
14373 case 'o':
14374 case 'x':
14375 case 'X':
14376 PyErr_Format(PyExc_TypeError,
14377 "%%%c format: an integer is required, "
14378 "not %.200s",
14379 type, Py_TYPE(v)->tp_name);
14380 break;
14381 default:
14382 PyErr_Format(PyExc_TypeError,
14383 "%%%c format: a number is required, "
14384 "not %.200s",
14385 type, Py_TYPE(v)->tp_name);
14386 break;
14387 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014388 return -1;
14389}
14390
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014391static Py_UCS4
14392formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014393{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014394 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014395 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014396 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014397 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014398 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014399 goto onError;
14400 }
14401 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014402 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014403 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014404 /* make sure number is a type of integer */
14405 if (!PyLong_Check(v)) {
14406 iobj = PyNumber_Index(v);
14407 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014408 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014409 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014410 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014411 Py_DECREF(iobj);
14412 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014413 else {
14414 x = PyLong_AsLong(v);
14415 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014416 if (x == -1 && PyErr_Occurred())
14417 goto onError;
14418
Victor Stinner8faf8212011-12-08 22:14:11 +010014419 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014420 PyErr_SetString(PyExc_OverflowError,
14421 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014422 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014423 }
14424
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014425 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014426 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014427
Benjamin Peterson29060642009-01-31 22:14:21 +000014428 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014429 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014430 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014431 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014432}
14433
Victor Stinnera47082312012-10-04 02:19:54 +020014434/* Parse options of an argument: flags, width, precision.
14435 Handle also "%(name)" syntax.
14436
14437 Return 0 if the argument has been formatted into arg->str.
14438 Return 1 if the argument has been written into ctx->writer,
14439 Raise an exception and return -1 on error. */
14440static int
14441unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14442 struct unicode_format_arg_t *arg)
14443{
14444#define FORMAT_READ(ctx) \
14445 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14446
14447 PyObject *v;
14448
Victor Stinnera47082312012-10-04 02:19:54 +020014449 if (arg->ch == '(') {
14450 /* Get argument value from a dictionary. Example: "%(name)s". */
14451 Py_ssize_t keystart;
14452 Py_ssize_t keylen;
14453 PyObject *key;
14454 int pcount = 1;
14455
14456 if (ctx->dict == NULL) {
14457 PyErr_SetString(PyExc_TypeError,
14458 "format requires a mapping");
14459 return -1;
14460 }
14461 ++ctx->fmtpos;
14462 --ctx->fmtcnt;
14463 keystart = ctx->fmtpos;
14464 /* Skip over balanced parentheses */
14465 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14466 arg->ch = FORMAT_READ(ctx);
14467 if (arg->ch == ')')
14468 --pcount;
14469 else if (arg->ch == '(')
14470 ++pcount;
14471 ctx->fmtpos++;
14472 }
14473 keylen = ctx->fmtpos - keystart - 1;
14474 if (ctx->fmtcnt < 0 || pcount > 0) {
14475 PyErr_SetString(PyExc_ValueError,
14476 "incomplete format key");
14477 return -1;
14478 }
14479 key = PyUnicode_Substring(ctx->fmtstr,
14480 keystart, keystart + keylen);
14481 if (key == NULL)
14482 return -1;
14483 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014484 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014485 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014486 }
14487 ctx->args = PyObject_GetItem(ctx->dict, key);
14488 Py_DECREF(key);
14489 if (ctx->args == NULL)
14490 return -1;
14491 ctx->args_owned = 1;
14492 ctx->arglen = -1;
14493 ctx->argidx = -2;
14494 }
14495
14496 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014497 while (--ctx->fmtcnt >= 0) {
14498 arg->ch = FORMAT_READ(ctx);
14499 ctx->fmtpos++;
14500 switch (arg->ch) {
14501 case '-': arg->flags |= F_LJUST; continue;
14502 case '+': arg->flags |= F_SIGN; continue;
14503 case ' ': arg->flags |= F_BLANK; continue;
14504 case '#': arg->flags |= F_ALT; continue;
14505 case '0': arg->flags |= F_ZERO; continue;
14506 }
14507 break;
14508 }
14509
14510 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014511 if (arg->ch == '*') {
14512 v = unicode_format_getnextarg(ctx);
14513 if (v == NULL)
14514 return -1;
14515 if (!PyLong_Check(v)) {
14516 PyErr_SetString(PyExc_TypeError,
14517 "* wants int");
14518 return -1;
14519 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014520 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014521 if (arg->width == -1 && PyErr_Occurred())
14522 return -1;
14523 if (arg->width < 0) {
14524 arg->flags |= F_LJUST;
14525 arg->width = -arg->width;
14526 }
14527 if (--ctx->fmtcnt >= 0) {
14528 arg->ch = FORMAT_READ(ctx);
14529 ctx->fmtpos++;
14530 }
14531 }
14532 else if (arg->ch >= '0' && arg->ch <= '9') {
14533 arg->width = arg->ch - '0';
14534 while (--ctx->fmtcnt >= 0) {
14535 arg->ch = FORMAT_READ(ctx);
14536 ctx->fmtpos++;
14537 if (arg->ch < '0' || arg->ch > '9')
14538 break;
14539 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14540 mixing signed and unsigned comparison. Since arg->ch is between
14541 '0' and '9', casting to int is safe. */
14542 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14543 PyErr_SetString(PyExc_ValueError,
14544 "width too big");
14545 return -1;
14546 }
14547 arg->width = arg->width*10 + (arg->ch - '0');
14548 }
14549 }
14550
14551 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014552 if (arg->ch == '.') {
14553 arg->prec = 0;
14554 if (--ctx->fmtcnt >= 0) {
14555 arg->ch = FORMAT_READ(ctx);
14556 ctx->fmtpos++;
14557 }
14558 if (arg->ch == '*') {
14559 v = unicode_format_getnextarg(ctx);
14560 if (v == NULL)
14561 return -1;
14562 if (!PyLong_Check(v)) {
14563 PyErr_SetString(PyExc_TypeError,
14564 "* wants int");
14565 return -1;
14566 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014567 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014568 if (arg->prec == -1 && PyErr_Occurred())
14569 return -1;
14570 if (arg->prec < 0)
14571 arg->prec = 0;
14572 if (--ctx->fmtcnt >= 0) {
14573 arg->ch = FORMAT_READ(ctx);
14574 ctx->fmtpos++;
14575 }
14576 }
14577 else if (arg->ch >= '0' && arg->ch <= '9') {
14578 arg->prec = arg->ch - '0';
14579 while (--ctx->fmtcnt >= 0) {
14580 arg->ch = FORMAT_READ(ctx);
14581 ctx->fmtpos++;
14582 if (arg->ch < '0' || arg->ch > '9')
14583 break;
14584 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14585 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014586 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014587 return -1;
14588 }
14589 arg->prec = arg->prec*10 + (arg->ch - '0');
14590 }
14591 }
14592 }
14593
14594 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14595 if (ctx->fmtcnt >= 0) {
14596 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14597 if (--ctx->fmtcnt >= 0) {
14598 arg->ch = FORMAT_READ(ctx);
14599 ctx->fmtpos++;
14600 }
14601 }
14602 }
14603 if (ctx->fmtcnt < 0) {
14604 PyErr_SetString(PyExc_ValueError,
14605 "incomplete format");
14606 return -1;
14607 }
14608 return 0;
14609
14610#undef FORMAT_READ
14611}
14612
14613/* Format one argument. Supported conversion specifiers:
14614
14615 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014616 - "i", "d", "u": int or float
14617 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014618 - "e", "E", "f", "F", "g", "G": float
14619 - "c": int or str (1 character)
14620
Victor Stinner8dbd4212012-12-04 09:30:24 +010014621 When possible, the output is written directly into the Unicode writer
14622 (ctx->writer). A string is created when padding is required.
14623
Victor Stinnera47082312012-10-04 02:19:54 +020014624 Return 0 if the argument has been formatted into *p_str,
14625 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014626 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014627static int
14628unicode_format_arg_format(struct unicode_formatter_t *ctx,
14629 struct unicode_format_arg_t *arg,
14630 PyObject **p_str)
14631{
14632 PyObject *v;
14633 _PyUnicodeWriter *writer = &ctx->writer;
14634
14635 if (ctx->fmtcnt == 0)
14636 ctx->writer.overallocate = 0;
14637
Victor Stinnera47082312012-10-04 02:19:54 +020014638 v = unicode_format_getnextarg(ctx);
14639 if (v == NULL)
14640 return -1;
14641
Victor Stinnera47082312012-10-04 02:19:54 +020014642
14643 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014644 case 's':
14645 case 'r':
14646 case 'a':
14647 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14648 /* Fast path */
14649 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14650 return -1;
14651 return 1;
14652 }
14653
14654 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14655 *p_str = v;
14656 Py_INCREF(*p_str);
14657 }
14658 else {
14659 if (arg->ch == 's')
14660 *p_str = PyObject_Str(v);
14661 else if (arg->ch == 'r')
14662 *p_str = PyObject_Repr(v);
14663 else
14664 *p_str = PyObject_ASCII(v);
14665 }
14666 break;
14667
14668 case 'i':
14669 case 'd':
14670 case 'u':
14671 case 'o':
14672 case 'x':
14673 case 'X':
14674 {
14675 int ret = mainformatlong(v, arg, p_str, writer);
14676 if (ret != 0)
14677 return ret;
14678 arg->sign = 1;
14679 break;
14680 }
14681
14682 case 'e':
14683 case 'E':
14684 case 'f':
14685 case 'F':
14686 case 'g':
14687 case 'G':
14688 if (arg->width == -1 && arg->prec == -1
14689 && !(arg->flags & (F_SIGN | F_BLANK)))
14690 {
14691 /* Fast path */
14692 if (formatfloat(v, arg, NULL, writer) == -1)
14693 return -1;
14694 return 1;
14695 }
14696
14697 arg->sign = 1;
14698 if (formatfloat(v, arg, p_str, NULL) == -1)
14699 return -1;
14700 break;
14701
14702 case 'c':
14703 {
14704 Py_UCS4 ch = formatchar(v);
14705 if (ch == (Py_UCS4) -1)
14706 return -1;
14707 if (arg->width == -1 && arg->prec == -1) {
14708 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014709 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014710 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014711 return 1;
14712 }
14713 *p_str = PyUnicode_FromOrdinal(ch);
14714 break;
14715 }
14716
14717 default:
14718 PyErr_Format(PyExc_ValueError,
14719 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014720 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014721 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14722 (int)arg->ch,
14723 ctx->fmtpos - 1);
14724 return -1;
14725 }
14726 if (*p_str == NULL)
14727 return -1;
14728 assert (PyUnicode_Check(*p_str));
14729 return 0;
14730}
14731
14732static int
14733unicode_format_arg_output(struct unicode_formatter_t *ctx,
14734 struct unicode_format_arg_t *arg,
14735 PyObject *str)
14736{
14737 Py_ssize_t len;
14738 enum PyUnicode_Kind kind;
14739 void *pbuf;
14740 Py_ssize_t pindex;
14741 Py_UCS4 signchar;
14742 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014743 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014744 Py_ssize_t sublen;
14745 _PyUnicodeWriter *writer = &ctx->writer;
14746 Py_UCS4 fill;
14747
14748 fill = ' ';
14749 if (arg->sign && arg->flags & F_ZERO)
14750 fill = '0';
14751
14752 if (PyUnicode_READY(str) == -1)
14753 return -1;
14754
14755 len = PyUnicode_GET_LENGTH(str);
14756 if ((arg->width == -1 || arg->width <= len)
14757 && (arg->prec == -1 || arg->prec >= len)
14758 && !(arg->flags & (F_SIGN | F_BLANK)))
14759 {
14760 /* Fast path */
14761 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14762 return -1;
14763 return 0;
14764 }
14765
14766 /* Truncate the string for "s", "r" and "a" formats
14767 if the precision is set */
14768 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14769 if (arg->prec >= 0 && len > arg->prec)
14770 len = arg->prec;
14771 }
14772
14773 /* Adjust sign and width */
14774 kind = PyUnicode_KIND(str);
14775 pbuf = PyUnicode_DATA(str);
14776 pindex = 0;
14777 signchar = '\0';
14778 if (arg->sign) {
14779 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14780 if (ch == '-' || ch == '+') {
14781 signchar = ch;
14782 len--;
14783 pindex++;
14784 }
14785 else if (arg->flags & F_SIGN)
14786 signchar = '+';
14787 else if (arg->flags & F_BLANK)
14788 signchar = ' ';
14789 else
14790 arg->sign = 0;
14791 }
14792 if (arg->width < len)
14793 arg->width = len;
14794
14795 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014796 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014797 if (!(arg->flags & F_LJUST)) {
14798 if (arg->sign) {
14799 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014800 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014801 }
14802 else {
14803 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014804 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014805 }
14806 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014807 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14808 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014809 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014810 }
14811
Victor Stinnera47082312012-10-04 02:19:54 +020014812 buflen = arg->width;
14813 if (arg->sign && len == arg->width)
14814 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014815 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014816 return -1;
14817
14818 /* Write the sign if needed */
14819 if (arg->sign) {
14820 if (fill != ' ') {
14821 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14822 writer->pos += 1;
14823 }
14824 if (arg->width > len)
14825 arg->width--;
14826 }
14827
14828 /* Write the numeric prefix for "x", "X" and "o" formats
14829 if the alternate form is used.
14830 For example, write "0x" for the "%#x" format. */
14831 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14832 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14833 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14834 if (fill != ' ') {
14835 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14836 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14837 writer->pos += 2;
14838 pindex += 2;
14839 }
14840 arg->width -= 2;
14841 if (arg->width < 0)
14842 arg->width = 0;
14843 len -= 2;
14844 }
14845
14846 /* Pad left with the fill character if needed */
14847 if (arg->width > len && !(arg->flags & F_LJUST)) {
14848 sublen = arg->width - len;
14849 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14850 writer->pos += sublen;
14851 arg->width = len;
14852 }
14853
14854 /* If padding with spaces: write sign if needed and/or numeric prefix if
14855 the alternate form is used */
14856 if (fill == ' ') {
14857 if (arg->sign) {
14858 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14859 writer->pos += 1;
14860 }
14861 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14862 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14863 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14864 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14865 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14866 writer->pos += 2;
14867 pindex += 2;
14868 }
14869 }
14870
14871 /* Write characters */
14872 if (len) {
14873 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14874 str, pindex, len);
14875 writer->pos += len;
14876 }
14877
14878 /* Pad right with the fill character if needed */
14879 if (arg->width > len) {
14880 sublen = arg->width - len;
14881 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14882 writer->pos += sublen;
14883 }
14884 return 0;
14885}
14886
14887/* Helper of PyUnicode_Format(): format one arg.
14888 Return 0 on success, raise an exception and return -1 on error. */
14889static int
14890unicode_format_arg(struct unicode_formatter_t *ctx)
14891{
14892 struct unicode_format_arg_t arg;
14893 PyObject *str;
14894 int ret;
14895
Victor Stinner8dbd4212012-12-04 09:30:24 +010014896 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014897 if (arg.ch == '%') {
14898 ctx->fmtpos++;
14899 ctx->fmtcnt--;
14900 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14901 return -1;
14902 return 0;
14903 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014904 arg.flags = 0;
14905 arg.width = -1;
14906 arg.prec = -1;
14907 arg.sign = 0;
14908 str = NULL;
14909
Victor Stinnera47082312012-10-04 02:19:54 +020014910 ret = unicode_format_arg_parse(ctx, &arg);
14911 if (ret == -1)
14912 return -1;
14913
14914 ret = unicode_format_arg_format(ctx, &arg, &str);
14915 if (ret == -1)
14916 return -1;
14917
14918 if (ret != 1) {
14919 ret = unicode_format_arg_output(ctx, &arg, str);
14920 Py_DECREF(str);
14921 if (ret == -1)
14922 return -1;
14923 }
14924
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014925 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014926 PyErr_SetString(PyExc_TypeError,
14927 "not all arguments converted during string formatting");
14928 return -1;
14929 }
14930 return 0;
14931}
14932
Alexander Belopolsky40018472011-02-26 01:02:56 +000014933PyObject *
14934PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014935{
Victor Stinnera47082312012-10-04 02:19:54 +020014936 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014937
Guido van Rossumd57fd912000-03-10 22:53:23 +000014938 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014939 PyErr_BadInternalCall();
14940 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014941 }
Victor Stinnera47082312012-10-04 02:19:54 +020014942
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014943 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014944 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014945
14946 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014947 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14948 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14949 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14950 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014951
Victor Stinner8f674cc2013-04-17 23:02:17 +020014952 _PyUnicodeWriter_Init(&ctx.writer);
14953 ctx.writer.min_length = ctx.fmtcnt + 100;
14954 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014955
Guido van Rossumd57fd912000-03-10 22:53:23 +000014956 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014957 ctx.arglen = PyTuple_Size(args);
14958 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014959 }
14960 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014961 ctx.arglen = -1;
14962 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014963 }
Victor Stinnera47082312012-10-04 02:19:54 +020014964 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014965 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014966 ctx.dict = args;
14967 else
14968 ctx.dict = NULL;
14969 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014970
Victor Stinnera47082312012-10-04 02:19:54 +020014971 while (--ctx.fmtcnt >= 0) {
14972 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014973 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014974
14975 nonfmtpos = ctx.fmtpos++;
14976 while (ctx.fmtcnt >= 0 &&
14977 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14978 ctx.fmtpos++;
14979 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014980 }
Victor Stinnera47082312012-10-04 02:19:54 +020014981 if (ctx.fmtcnt < 0) {
14982 ctx.fmtpos--;
14983 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014984 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014985
Victor Stinnercfc4c132013-04-03 01:48:39 +020014986 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14987 nonfmtpos, ctx.fmtpos) < 0)
14988 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014989 }
14990 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014991 ctx.fmtpos++;
14992 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014993 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014994 }
14995 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014996
Victor Stinnera47082312012-10-04 02:19:54 +020014997 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014998 PyErr_SetString(PyExc_TypeError,
14999 "not all arguments converted during string formatting");
15000 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015001 }
15002
Victor Stinnera47082312012-10-04 02:19:54 +020015003 if (ctx.args_owned) {
15004 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015005 }
Victor Stinnera47082312012-10-04 02:19:54 +020015006 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015007
Benjamin Peterson29060642009-01-31 22:14:21 +000015008 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015009 _PyUnicodeWriter_Dealloc(&ctx.writer);
15010 if (ctx.args_owned) {
15011 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015012 }
15013 return NULL;
15014}
15015
Jeremy Hylton938ace62002-07-17 16:30:39 +000015016static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015017unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15018
Tim Peters6d6c1a32001-08-02 04:15:00 +000015019static PyObject *
15020unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15021{
Benjamin Peterson29060642009-01-31 22:14:21 +000015022 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015023 static char *kwlist[] = {"object", "encoding", "errors", 0};
15024 char *encoding = NULL;
15025 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015026
Benjamin Peterson14339b62009-01-31 16:36:08 +000015027 if (type != &PyUnicode_Type)
15028 return unicode_subtype_new(type, args, kwds);
15029 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015030 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015031 return NULL;
15032 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015033 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015034 if (encoding == NULL && errors == NULL)
15035 return PyObject_Str(x);
15036 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015037 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015038}
15039
Guido van Rossume023fe02001-08-30 03:12:59 +000015040static PyObject *
15041unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15042{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015043 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015044 Py_ssize_t length, char_size;
15045 int share_wstr, share_utf8;
15046 unsigned int kind;
15047 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015048
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015050
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015051 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015052 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015053 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015054 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015055 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015056 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015057 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015058 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015059
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015060 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015061 if (self == NULL) {
15062 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015063 return NULL;
15064 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015065 kind = PyUnicode_KIND(unicode);
15066 length = PyUnicode_GET_LENGTH(unicode);
15067
15068 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015069#ifdef Py_DEBUG
15070 _PyUnicode_HASH(self) = -1;
15071#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015072 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015073#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015074 _PyUnicode_STATE(self).interned = 0;
15075 _PyUnicode_STATE(self).kind = kind;
15076 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015077 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015078 _PyUnicode_STATE(self).ready = 1;
15079 _PyUnicode_WSTR(self) = NULL;
15080 _PyUnicode_UTF8_LENGTH(self) = 0;
15081 _PyUnicode_UTF8(self) = NULL;
15082 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015083 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015084
15085 share_utf8 = 0;
15086 share_wstr = 0;
15087 if (kind == PyUnicode_1BYTE_KIND) {
15088 char_size = 1;
15089 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15090 share_utf8 = 1;
15091 }
15092 else if (kind == PyUnicode_2BYTE_KIND) {
15093 char_size = 2;
15094 if (sizeof(wchar_t) == 2)
15095 share_wstr = 1;
15096 }
15097 else {
15098 assert(kind == PyUnicode_4BYTE_KIND);
15099 char_size = 4;
15100 if (sizeof(wchar_t) == 4)
15101 share_wstr = 1;
15102 }
15103
15104 /* Ensure we won't overflow the length. */
15105 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15106 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015107 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015108 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015109 data = PyObject_MALLOC((length + 1) * char_size);
15110 if (data == NULL) {
15111 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015112 goto onError;
15113 }
15114
Victor Stinnerc3c74152011-10-02 20:39:55 +020015115 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015116 if (share_utf8) {
15117 _PyUnicode_UTF8_LENGTH(self) = length;
15118 _PyUnicode_UTF8(self) = data;
15119 }
15120 if (share_wstr) {
15121 _PyUnicode_WSTR_LENGTH(self) = length;
15122 _PyUnicode_WSTR(self) = (wchar_t *)data;
15123 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015124
Christian Heimesf051e432016-09-13 20:22:02 +020015125 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015126 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015127 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015128#ifdef Py_DEBUG
15129 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15130#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015131 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015132 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015133
15134onError:
15135 Py_DECREF(unicode);
15136 Py_DECREF(self);
15137 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015138}
15139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015140PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015141"str(object='') -> str\n\
15142str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015143\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015144Create a new string object from the given object. If encoding or\n\
15145errors is specified, then the object must expose a data buffer\n\
15146that will be decoded using the given encoding and error handler.\n\
15147Otherwise, returns the result of object.__str__() (if defined)\n\
15148or repr(object).\n\
15149encoding defaults to sys.getdefaultencoding().\n\
15150errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015151
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015152static PyObject *unicode_iter(PyObject *seq);
15153
Guido van Rossumd57fd912000-03-10 22:53:23 +000015154PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015155 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015156 "str", /* tp_name */
15157 sizeof(PyUnicodeObject), /* tp_size */
15158 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015159 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 (destructor)unicode_dealloc, /* tp_dealloc */
15161 0, /* tp_print */
15162 0, /* tp_getattr */
15163 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015164 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015165 unicode_repr, /* tp_repr */
15166 &unicode_as_number, /* tp_as_number */
15167 &unicode_as_sequence, /* tp_as_sequence */
15168 &unicode_as_mapping, /* tp_as_mapping */
15169 (hashfunc) unicode_hash, /* tp_hash*/
15170 0, /* tp_call*/
15171 (reprfunc) unicode_str, /* tp_str */
15172 PyObject_GenericGetAttr, /* tp_getattro */
15173 0, /* tp_setattro */
15174 0, /* tp_as_buffer */
15175 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015176 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015177 unicode_doc, /* tp_doc */
15178 0, /* tp_traverse */
15179 0, /* tp_clear */
15180 PyUnicode_RichCompare, /* tp_richcompare */
15181 0, /* tp_weaklistoffset */
15182 unicode_iter, /* tp_iter */
15183 0, /* tp_iternext */
15184 unicode_methods, /* tp_methods */
15185 0, /* tp_members */
15186 0, /* tp_getset */
15187 &PyBaseObject_Type, /* tp_base */
15188 0, /* tp_dict */
15189 0, /* tp_descr_get */
15190 0, /* tp_descr_set */
15191 0, /* tp_dictoffset */
15192 0, /* tp_init */
15193 0, /* tp_alloc */
15194 unicode_new, /* tp_new */
15195 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015196};
15197
15198/* Initialize the Unicode implementation */
15199
Victor Stinner3a50e702011-10-18 21:21:00 +020015200int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015201{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015202 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015203 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015204 0x000A, /* LINE FEED */
15205 0x000D, /* CARRIAGE RETURN */
15206 0x001C, /* FILE SEPARATOR */
15207 0x001D, /* GROUP SEPARATOR */
15208 0x001E, /* RECORD SEPARATOR */
15209 0x0085, /* NEXT LINE */
15210 0x2028, /* LINE SEPARATOR */
15211 0x2029, /* PARAGRAPH SEPARATOR */
15212 };
15213
Fred Drakee4315f52000-05-09 19:53:39 +000015214 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015215 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015216 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015217 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015218 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015219
Guido van Rossumcacfc072002-05-24 19:01:59 +000015220 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015221 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015222
15223 /* initialize the linebreak bloom filter */
15224 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015225 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015226 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015227
Christian Heimes26532f72013-07-20 14:57:16 +020015228 if (PyType_Ready(&EncodingMapType) < 0)
15229 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015230
Benjamin Petersonc4311282012-10-30 23:21:10 -040015231 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15232 Py_FatalError("Can't initialize field name iterator type");
15233
15234 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15235 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015236
Victor Stinner3a50e702011-10-18 21:21:00 +020015237 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015238}
15239
15240/* Finalize the Unicode implementation */
15241
/* Free-list clearing hook.  This implementation has nothing to release,
   so it unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15247
Guido van Rossumd57fd912000-03-10 22:53:23 +000015248void
Thomas Wouters78890102000-07-22 19:25:51 +000015249_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015250{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015251 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015252
Serhiy Storchaka05997252013-01-26 12:14:02 +020015253 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015254
Serhiy Storchaka05997252013-01-26 12:14:02 +020015255 for (i = 0; i < 256; i++)
15256 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015257 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015258 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015259}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015260
Walter Dörwald16807132007-05-25 13:52:07 +000015261void
15262PyUnicode_InternInPlace(PyObject **p)
15263{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015264 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015265 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015266#ifdef Py_DEBUG
15267 assert(s != NULL);
15268 assert(_PyUnicode_CHECK(s));
15269#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015271 return;
15272#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 /* If it's a subclass, we don't really know what putting
15274 it in the interned dict might do. */
15275 if (!PyUnicode_CheckExact(s))
15276 return;
15277 if (PyUnicode_CHECK_INTERNED(s))
15278 return;
15279 if (interned == NULL) {
15280 interned = PyDict_New();
15281 if (interned == NULL) {
15282 PyErr_Clear(); /* Don't leave an exception */
15283 return;
15284 }
15285 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015286 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015287 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015288 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015289 if (t == NULL) {
15290 PyErr_Clear();
15291 return;
15292 }
15293 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015294 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015295 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015296 return;
15297 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 /* The two references in interned are not counted by refcnt.
15299 The deallocator will take care of this */
15300 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015301 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015302}
15303
15304void
15305PyUnicode_InternImmortal(PyObject **p)
15306{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 PyUnicode_InternInPlace(p);
15308 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015309 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 Py_INCREF(*p);
15311 }
Walter Dörwald16807132007-05-25 13:52:07 +000015312}
15313
15314PyObject *
15315PyUnicode_InternFromString(const char *cp)
15316{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015317 PyObject *s = PyUnicode_FromString(cp);
15318 if (s == NULL)
15319 return NULL;
15320 PyUnicode_InternInPlace(&s);
15321 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015322}
15323
Alexander Belopolsky40018472011-02-26 01:02:56 +000015324void
15325_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015326{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015327 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015328 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 Py_ssize_t i, n;
15330 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015331
Benjamin Peterson14339b62009-01-31 16:36:08 +000015332 if (interned == NULL || !PyDict_Check(interned))
15333 return;
15334 keys = PyDict_Keys(interned);
15335 if (keys == NULL || !PyList_Check(keys)) {
15336 PyErr_Clear();
15337 return;
15338 }
Walter Dörwald16807132007-05-25 13:52:07 +000015339
Benjamin Peterson14339b62009-01-31 16:36:08 +000015340 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15341 detector, interned unicode strings are not forcibly deallocated;
15342 rather, we give them their stolen references back, and then clear
15343 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015344
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 n = PyList_GET_SIZE(keys);
15346 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015347 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015349 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015350 if (PyUnicode_READY(s) == -1) {
15351 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015352 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015354 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015355 case SSTATE_NOT_INTERNED:
15356 /* XXX Shouldn't happen */
15357 break;
15358 case SSTATE_INTERNED_IMMORTAL:
15359 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015360 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 break;
15362 case SSTATE_INTERNED_MORTAL:
15363 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015364 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 break;
15366 default:
15367 Py_FatalError("Inconsistent interned string state.");
15368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015369 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015370 }
15371 fprintf(stderr, "total size of all interned strings: "
15372 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15373 "mortal/immortal\n", mortal_size, immortal_size);
15374 Py_DECREF(keys);
15375 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015376 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015377}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015378
15379
15380/********************* Unicode Iterator **************************/
15381
15382typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015383 PyObject_HEAD
15384 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015385 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015386} unicodeiterobject;
15387
15388static void
15389unicodeiter_dealloc(unicodeiterobject *it)
15390{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015391 _PyObject_GC_UNTRACK(it);
15392 Py_XDECREF(it->it_seq);
15393 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015394}
15395
15396static int
15397unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15398{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015399 Py_VISIT(it->it_seq);
15400 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015401}
15402
15403static PyObject *
15404unicodeiter_next(unicodeiterobject *it)
15405{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015406 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015407
Benjamin Peterson14339b62009-01-31 16:36:08 +000015408 assert(it != NULL);
15409 seq = it->it_seq;
15410 if (seq == NULL)
15411 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015412 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015414 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15415 int kind = PyUnicode_KIND(seq);
15416 void *data = PyUnicode_DATA(seq);
15417 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15418 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015419 if (item != NULL)
15420 ++it->it_index;
15421 return item;
15422 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015423
Benjamin Peterson14339b62009-01-31 16:36:08 +000015424 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015425 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015426 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015427}
15428
15429static PyObject *
15430unicodeiter_len(unicodeiterobject *it)
15431{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015432 Py_ssize_t len = 0;
15433 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015434 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015435 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015436}
15437
15438PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15439
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015440static PyObject *
15441unicodeiter_reduce(unicodeiterobject *it)
15442{
15443 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015444 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015445 it->it_seq, it->it_index);
15446 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015447 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015448 if (u == NULL)
15449 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015450 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015451 }
15452}
15453
15454PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15455
15456static PyObject *
15457unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15458{
15459 Py_ssize_t index = PyLong_AsSsize_t(state);
15460 if (index == -1 && PyErr_Occurred())
15461 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015462 if (it->it_seq != NULL) {
15463 if (index < 0)
15464 index = 0;
15465 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15466 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15467 it->it_index = index;
15468 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015469 Py_RETURN_NONE;
15470}
15471
15472PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15473
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015474static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015475 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015476 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015477 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15478 reduce_doc},
15479 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15480 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015481 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015482};
15483
15484PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015485 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15486 "str_iterator", /* tp_name */
15487 sizeof(unicodeiterobject), /* tp_basicsize */
15488 0, /* tp_itemsize */
15489 /* methods */
15490 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15491 0, /* tp_print */
15492 0, /* tp_getattr */
15493 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015494 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015495 0, /* tp_repr */
15496 0, /* tp_as_number */
15497 0, /* tp_as_sequence */
15498 0, /* tp_as_mapping */
15499 0, /* tp_hash */
15500 0, /* tp_call */
15501 0, /* tp_str */
15502 PyObject_GenericGetAttr, /* tp_getattro */
15503 0, /* tp_setattro */
15504 0, /* tp_as_buffer */
15505 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15506 0, /* tp_doc */
15507 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15508 0, /* tp_clear */
15509 0, /* tp_richcompare */
15510 0, /* tp_weaklistoffset */
15511 PyObject_SelfIter, /* tp_iter */
15512 (iternextfunc)unicodeiter_next, /* tp_iternext */
15513 unicodeiter_methods, /* tp_methods */
15514 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015515};
15516
15517static PyObject *
15518unicode_iter(PyObject *seq)
15519{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015520 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015521
Benjamin Peterson14339b62009-01-31 16:36:08 +000015522 if (!PyUnicode_Check(seq)) {
15523 PyErr_BadInternalCall();
15524 return NULL;
15525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015526 if (PyUnicode_READY(seq) == -1)
15527 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015528 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15529 if (it == NULL)
15530 return NULL;
15531 it->it_index = 0;
15532 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015533 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015534 _PyObject_GC_TRACK(it);
15535 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015536}
15537
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015538
15539size_t
15540Py_UNICODE_strlen(const Py_UNICODE *u)
15541{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015542 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015543}
15544
15545Py_UNICODE*
15546Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15547{
15548 Py_UNICODE *u = s1;
15549 while ((*u++ = *s2++));
15550 return s1;
15551}
15552
15553Py_UNICODE*
15554Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15555{
15556 Py_UNICODE *u = s1;
15557 while ((*u++ = *s2++))
15558 if (n-- == 0)
15559 break;
15560 return s1;
15561}
15562
15563Py_UNICODE*
15564Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15565{
15566 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015567 u1 += wcslen(u1);
15568 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015569 return s1;
15570}
15571
15572int
15573Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15574{
15575 while (*s1 && *s2 && *s1 == *s2)
15576 s1++, s2++;
15577 if (*s1 && *s2)
15578 return (*s1 < *s2) ? -1 : +1;
15579 if (*s1)
15580 return 1;
15581 if (*s2)
15582 return -1;
15583 return 0;
15584}
15585
15586int
15587Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15588{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015589 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015590 for (; n != 0; n--) {
15591 u1 = *s1;
15592 u2 = *s2;
15593 if (u1 != u2)
15594 return (u1 < u2) ? -1 : +1;
15595 if (u1 == '\0')
15596 return 0;
15597 s1++;
15598 s2++;
15599 }
15600 return 0;
15601}
15602
15603Py_UNICODE*
15604Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15605{
15606 const Py_UNICODE *p;
15607 for (p = s; *p; p++)
15608 if (*p == c)
15609 return (Py_UNICODE*)p;
15610 return NULL;
15611}
15612
15613Py_UNICODE*
15614Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15615{
15616 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015617 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015618 while (p != s) {
15619 p--;
15620 if (*p == c)
15621 return (Py_UNICODE*)p;
15622 }
15623 return NULL;
15624}
Victor Stinner331ea922010-08-10 16:37:20 +000015625
Victor Stinner71133ff2010-09-01 23:43:53 +000015626Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015627PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015628{
Victor Stinner577db2c2011-10-11 22:12:48 +020015629 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015630 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015632 if (!PyUnicode_Check(unicode)) {
15633 PyErr_BadArgument();
15634 return NULL;
15635 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015636 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015637 if (u == NULL)
15638 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015639 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015640 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015641 PyErr_NoMemory();
15642 return NULL;
15643 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015644 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015645 size *= sizeof(Py_UNICODE);
15646 copy = PyMem_Malloc(size);
15647 if (copy == NULL) {
15648 PyErr_NoMemory();
15649 return NULL;
15650 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015651 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015652 return copy;
15653}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015654
Georg Brandl66c221e2010-10-14 07:04:07 +000015655/* A _string module, to export formatter_parser and formatter_field_name_split
15656 to the string.Formatter class implemented in Python. */
15657
15658static PyMethodDef _string_methods[] = {
15659 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15660 METH_O, PyDoc_STR("split the argument as a field name")},
15661 {"formatter_parser", (PyCFunction) formatter_parser,
15662 METH_O, PyDoc_STR("parse the argument as a format string")},
15663 {NULL, NULL}
15664};
15665
15666static struct PyModuleDef _string_module = {
15667 PyModuleDef_HEAD_INIT,
15668 "_string",
15669 PyDoc_STR("string helper module"),
15670 0,
15671 _string_methods,
15672 NULL,
15673 NULL,
15674 NULL,
15675 NULL
15676};
15677
15678PyMODINIT_FUNC
15679PyInit__string(void)
15680{
15681 return PyModule_Create(&_string_module);
15682}
15683
15684
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015685#ifdef __cplusplus
15686}
15687#endif