blob: e396d68d1209f7382ab1cbc6fc322e1b401ebf0c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090052class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
55
56/*[python input]
57class Py_UCS4_converter(CConverter):
58 type = 'Py_UCS4'
59 converter = 'convert_uc'
60
61 def converter_init(self):
62 if self.default is not unspecified:
63 self.c_default = ascii(self.default)
64 if len(self.c_default) > 4 or self.c_default[0] != "'":
65 self.c_default = hex(ord(self.default))
66
67[python start generated code]*/
68/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080069
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000070/* --- Globals ------------------------------------------------------------
71
Serhiy Storchaka05997252013-01-26 12:14:02 +020072NOTE: In the interpreter's initialization phase, some globals are currently
73 initialized dynamically as needed. In the process Unicode objects may
74 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000075
76*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000078
79#ifdef __cplusplus
80extern "C" {
81#endif
82
/* Largest code point defined by Unicode (6.0): U+10FFFF, i.e. 1,114,111. */
#define MAX_UNICODE 0x10ffff
85
/* Object sanity check used by the assertions in this file.  Debug builds run
   the full consistency checker (with check_content=0); other builds only
   check the object type. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020091
/* --- Raw field accessors ------------------------------------------------
   Each macro casts `op` to the structure that declares the field and
   expands to the member itself.  No checks are performed. */
#define _PyUnicode_UTF8(op)             (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)      (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)             (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)      (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)           (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)            (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)             (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)         (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors --------------------------------------------------
   These assert that `op` passes _PyUnicode_CHECK() before reading it. */

/* UTF-8 buffer of a ready string.  For compact ASCII objects this is the
   memory directly following the PyASCIIObject header; otherwise it is the
   `utf8` field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length of the UTF-8 buffer of a ready string.  For compact ASCII objects
   this is the `length` field; otherwise it is the `utf8_length` field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* `state.kind` field, after checking the object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)

/* `length` field, after checking the object. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* File-local replacement for PyUnicode_READY(): asserts the object check,
   then evaluates to 0 if the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200133
/* True if the `utf8` pointer of `op` is the same as its data pointer
   (PyUnicode_DATA), i.e. the UTF-8 form shares the data buffer.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the `wstr` pointer of `op` is the same as its data pointer
   (PyUnicode_DATA), i.e. the wchar_t form shares the data buffer.
   Only the macro argument is referenced: the expansion must not depend on
   a variable named `unicode` existing at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
141
/* Nonzero when the object owns a separate UTF-8 block: it is not compact
   ASCII, its utf8 pointer is set, and that pointer differs from the data
   pointer (so the block is not shared with the character data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Nonzero when the object owns a separate wstr block: its wstr pointer is
   set and either the string is not ready yet or wstr differs from the data
   pointer (so the block is not shared with the character data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
155
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The source range is only read, so it is accessed through pointers to
   const; const-qualified source buffers can be passed without their
   qualifier being cast away.  The copy is unrolled four elements at a
   time, followed by a scalar loop for the remaining (< 4) elements.
   All locals carry a leading underscore to stay out of the caller's
   namespace. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200179
/* Overallocation divisor: per the percentages below, a factor of N
   corresponds to overallocating by 1/N. */
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
187
Walter Dörwald16807132007-05-25 13:52:07 +0000188/* This dictionary holds all interned unicode strings. Note that references
189 to strings in this dictionary are *not* counted in the string's ob_refcnt.
190 When the interned string reaches a refcnt of 0 the string deallocation
191 function will delete the reference from this dictionary.
192
193 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000194 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000195*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200196static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200199static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200200
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200202 do { \
203 if (unicode_empty != NULL) \
204 Py_INCREF(unicode_empty); \
205 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206 unicode_empty = PyUnicode_New(0, 0); \
207 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200208 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
210 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213
Serhiy Storchaka678db842013-01-26 12:16:36 +0200214#define _Py_RETURN_UNICODE_EMPTY() \
215 do { \
216 _Py_INCREF_UNICODE_EMPTY(); \
217 return unicode_empty; \
218 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200220/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700221static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200222_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200224/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000227/* Single character Unicode strings in the Latin-1 range are being
228 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200229static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise.

   Entries set to 1:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
261
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200262/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200263static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200264static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100265static int unicode_modifiable(PyObject *unicode);
266
Victor Stinnerfe226c02011-10-03 03:52:20 +0200267
Alexander Belopolsky40018472011-02-26 01:02:56 +0000268static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100269_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200270static PyObject *
271_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
272static PyObject *
273_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
274
275static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000276unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000277 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100278 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000279 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
280
Alexander Belopolsky40018472011-02-26 01:02:56 +0000281static void
282raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300283 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100284 PyObject *unicode,
285 Py_ssize_t startpos, Py_ssize_t endpos,
286 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000287
/* Same kind of table for line breaks: 128 entries indexed by ASCII code,
   1 for a line break character and 0 otherwise.

   Entries set to 1:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
315
INADA Naoki3ae20562017-01-16 20:41:20 +0900316static int convert_uc(PyObject *obj, void *addr);
317
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300318#include "clinic/unicodeobject.c.h"
319
/* Identifiers for the error handler names recognised by
   get_error_handler().  _Py_ERROR_OTHER stands for any name that is not in
   the list. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching identifier
   for the other known names (exact, case-sensitive comparison), and
   _Py_ERROR_OTHER for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* a missing name means the default handler */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
358
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300359/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
360 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000361Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000362PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000363{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000364#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000365 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000366#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000367 /* This is actually an illegal character, so it should
368 not be passed to unichr. */
369 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000370#endif
371}
372
#ifdef Py_DEBUG
/* Debug-build validator for the internal state of a Unicode object.

   op:            object to inspect; must satisfy PyUnicode_Check().
   check_content: when nonzero (and the string is not of the wchar kind),
                  also scan the characters to verify that the narrowest
                  possible kind is in use and that the buffer ends with a
                  null character.

   Every violation is reported through assert(); the function itself
   always returns 1. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cobj = (PyCompactUnicodeObject *)op;
        void *buf;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (hdr->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            buf = cobj + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(cobj->utf8 != buf);
        }
        else {
            /* not compact: characters live behind data.any */
            buf = ((PyUnicodeObject *)op)->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(hdr->length == 0);
                assert(hdr->hash == -1);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ascii == 0);
                assert(hdr->state.ready == 0);
                assert(hdr->state.interned == SSTATE_NOT_INTERNED);
                assert(hdr->wstr != NULL);
                assert(buf == NULL);
                assert(cobj->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ready == 1);
                assert(buf != NULL);
                if (hdr->state.ascii) {
                    /* ASCII: utf8 is the data buffer itself */
                    assert(cobj->utf8 == buf);
                    assert(cobj->utf8_length == hdr->length);
                }
                else {
                    assert(cobj->utf8 != buf);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                /* same width as wchar_t: wstr is the data buffer */
                assert(hdr->wstr == buf);
                assert(cobj->wstr_length == hdr->length);
            }
            else {
                assert(hdr->wstr != buf);
            }
        }

        /* absent representations must report a zero length */
        if (cobj->utf8 == NULL) {
            assert(cobj->utf8_length == 0);
        }
        if (hdr->wstr == NULL) {
            assert(cobj->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *chars = PyUnicode_DATA(hdr);

        for (pos = 0; pos < hdr->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (hdr->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the element just past the last character must be null */
        assert(PyUnicode_READ(kind, chars, hdr->length) == 0);
    }
    return 1;
}
#endif
488
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489static PyObject*
490unicode_result_wchar(PyObject *unicode)
491{
492#ifndef Py_DEBUG
493 Py_ssize_t len;
494
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100495 len = _PyUnicode_WSTR_LENGTH(unicode);
496 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100497 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100499 }
500
501 if (len == 1) {
502 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100503 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100504 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
505 Py_DECREF(unicode);
506 return latin1_char;
507 }
508 }
509
510 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200511 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100512 return NULL;
513 }
514#else
Victor Stinneraa771272012-10-04 02:32:58 +0200515 assert(Py_REFCNT(unicode) == 1);
516
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100517 /* don't make the result ready in debug mode to ensure that the caller
518 makes the string ready before using it */
519 assert(_PyUnicode_CheckConsistency(unicode, 1));
520#endif
521 return unicode;
522}
523
524static PyObject*
525unicode_result_ready(PyObject *unicode)
526{
527 Py_ssize_t length;
528
529 length = PyUnicode_GET_LENGTH(unicode);
530 if (length == 0) {
531 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100532 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200533 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100534 }
535 return unicode_empty;
536 }
537
538 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200539 void *data = PyUnicode_DATA(unicode);
540 int kind = PyUnicode_KIND(unicode);
541 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100542 if (ch < 256) {
543 PyObject *latin1_char = unicode_latin1[ch];
544 if (latin1_char != NULL) {
545 if (unicode != latin1_char) {
546 Py_INCREF(latin1_char);
547 Py_DECREF(unicode);
548 }
549 return latin1_char;
550 }
551 else {
552 assert(_PyUnicode_CheckConsistency(unicode, 1));
553 Py_INCREF(unicode);
554 unicode_latin1[ch] = unicode;
555 return unicode;
556 }
557 }
558 }
559
560 assert(_PyUnicode_CheckConsistency(unicode, 1));
561 return unicode;
562}
563
564static PyObject*
565unicode_result(PyObject *unicode)
566{
567 assert(_PyUnicode_CHECK(unicode));
568 if (PyUnicode_IS_READY(unicode))
569 return unicode_result_ready(unicode);
570 else
571 return unicode_result_wchar(unicode);
572}
573
Victor Stinnerc4b49542011-12-11 22:44:26 +0100574static PyObject*
575unicode_result_unchanged(PyObject *unicode)
576{
577 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500578 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100579 return NULL;
580 Py_INCREF(unicode);
581 return unicode;
582 }
583 else
584 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100585 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100586}
587
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200588/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
589 ASCII, Latin1, UTF-8, etc. */
590static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200591backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200592 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
593{
Victor Stinnerad771582015-10-09 12:38:53 +0200594 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200595 Py_UCS4 ch;
596 enum PyUnicode_Kind kind;
597 void *data;
598
599 assert(PyUnicode_IS_READY(unicode));
600 kind = PyUnicode_KIND(unicode);
601 data = PyUnicode_DATA(unicode);
602
603 size = 0;
604 /* determine replacement size */
605 for (i = collstart; i < collend; ++i) {
606 Py_ssize_t incr;
607
608 ch = PyUnicode_READ(kind, data, i);
609 if (ch < 0x100)
610 incr = 2+2;
611 else if (ch < 0x10000)
612 incr = 2+4;
613 else {
614 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200615 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200616 }
617 if (size > PY_SSIZE_T_MAX - incr) {
618 PyErr_SetString(PyExc_OverflowError,
619 "encoded result is too long for a Python string");
620 return NULL;
621 }
622 size += incr;
623 }
624
Victor Stinnerad771582015-10-09 12:38:53 +0200625 str = _PyBytesWriter_Prepare(writer, str, size);
626 if (str == NULL)
627 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628
629 /* generate replacement */
630 for (i = collstart; i < collend; ++i) {
631 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200632 *str++ = '\\';
633 if (ch >= 0x00010000) {
634 *str++ = 'U';
635 *str++ = Py_hexdigits[(ch>>28)&0xf];
636 *str++ = Py_hexdigits[(ch>>24)&0xf];
637 *str++ = Py_hexdigits[(ch>>20)&0xf];
638 *str++ = Py_hexdigits[(ch>>16)&0xf];
639 *str++ = Py_hexdigits[(ch>>12)&0xf];
640 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200641 }
Victor Stinner797485e2015-10-09 03:17:30 +0200642 else if (ch >= 0x100) {
643 *str++ = 'u';
644 *str++ = Py_hexdigits[(ch>>12)&0xf];
645 *str++ = Py_hexdigits[(ch>>8)&0xf];
646 }
647 else
648 *str++ = 'x';
649 *str++ = Py_hexdigits[(ch>>4)&0xf];
650 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 }
652 return str;
653}
654
655/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
656 ASCII, Latin1, UTF-8, etc. */
657static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200658xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200659 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
660{
Victor Stinnerad771582015-10-09 12:38:53 +0200661 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 Py_UCS4 ch;
663 enum PyUnicode_Kind kind;
664 void *data;
665
666 assert(PyUnicode_IS_READY(unicode));
667 kind = PyUnicode_KIND(unicode);
668 data = PyUnicode_DATA(unicode);
669
670 size = 0;
671 /* determine replacement size */
672 for (i = collstart; i < collend; ++i) {
673 Py_ssize_t incr;
674
675 ch = PyUnicode_READ(kind, data, i);
676 if (ch < 10)
677 incr = 2+1+1;
678 else if (ch < 100)
679 incr = 2+2+1;
680 else if (ch < 1000)
681 incr = 2+3+1;
682 else if (ch < 10000)
683 incr = 2+4+1;
684 else if (ch < 100000)
685 incr = 2+5+1;
686 else if (ch < 1000000)
687 incr = 2+6+1;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+7+1;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
Victor Stinnerad771582015-10-09 12:38:53 +0200700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
707 }
708 return str;
709}
710
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711/* --- Bloom Filters ----------------------------------------------------- */
712
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   (ch & (BLOOM_WIDTH - 1)) of each Unicode character as the bit index. */
716
717/* the linebreak mask is set up by Unicode_Init below */
718
Antoine Pitrouf068f942010-01-13 14:19:12 +0000719#if LONG_BIT >= 128
720#define BLOOM_WIDTH 128
721#elif LONG_BIT >= 64
722#define BLOOM_WIDTH 64
723#elif LONG_BIT >= 32
724#define BLOOM_WIDTH 32
725#else
726#error "LONG_BIT is smaller than 32"
727#endif
728
Thomas Wouters477c8d52006-05-27 19:21:47 +0000729#define BLOOM_MASK unsigned long
730
Serhiy Storchaka05997252013-01-26 12:14:02 +0200731static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
Benjamin Peterson29060642009-01-31 22:14:21 +0000735#define BLOOM_LINEBREAK(ch) \
736 ((ch) < 128U ? ascii_linebreak[(ch)] : \
737 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700739static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741{
Victor Stinnera85af502013-04-09 21:53:54 +0200742#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
743 do { \
744 TYPE *data = (TYPE *)PTR; \
745 TYPE *end = data + LEN; \
746 Py_UCS4 ch; \
747 for (; data != end; data++) { \
748 ch = *data; \
749 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
750 } \
751 break; \
752 } while (0)
753
Thomas Wouters477c8d52006-05-27 19:21:47 +0000754 /* calculate simple bloom-style bitmask for a given unicode string */
755
Antoine Pitrouf068f942010-01-13 14:19:12 +0000756 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000757
758 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200759 switch (kind) {
760 case PyUnicode_1BYTE_KIND:
761 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
762 break;
763 case PyUnicode_2BYTE_KIND:
764 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
765 break;
766 case PyUnicode_4BYTE_KIND:
767 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
768 break;
769 default:
770 assert(0);
771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200773
774#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775}
776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300777static int
778ensure_unicode(PyObject *obj)
779{
780 if (!PyUnicode_Check(obj)) {
781 PyErr_Format(PyExc_TypeError,
782 "must be str, not %.100s",
783 Py_TYPE(obj)->tp_name);
784 return -1;
785 }
786 return PyUnicode_READY(obj);
787}
788
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200789/* Compilation of templated routines */
790
791#include "stringlib/asciilib.h"
792#include "stringlib/fastsearch.h"
793#include "stringlib/partition.h"
794#include "stringlib/split.h"
795#include "stringlib/count.h"
796#include "stringlib/find.h"
797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs1lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300807#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs2lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300818#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
823#include "stringlib/ucs4lib.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/partition.h"
826#include "stringlib/split.h"
827#include "stringlib/count.h"
828#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300829#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200830#include "stringlib/find_max_char.h"
831#include "stringlib/localeutil.h"
832#include "stringlib/undef.h"
833
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200834#include "stringlib/unicodedefs.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/count.h"
837#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100838#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200839
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840/* --- Unicode Object ----------------------------------------------------- */
841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200843fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700845static inline Py_ssize_t
846findchar(const void *s, int kind,
847 Py_ssize_t size, Py_UCS4 ch,
848 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 switch (kind) {
851 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200852 if ((Py_UCS1) ch != ch)
853 return -1;
854 if (direction > 0)
855 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
856 else
857 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200858 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200859 if ((Py_UCS2) ch != ch)
860 return -1;
861 if (direction > 0)
862 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
863 else
864 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200865 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200866 if (direction > 0)
867 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
868 else
869 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200870 default:
871 assert(0);
872 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to (excluding) the current
   length are overwritten, with 0xff bytes; nothing is done if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        /* kind doubles as the size in bytes of one character */
        Py_UCS1 *first_new = data + old_length * kind;
        memset(first_new, 0xff, (length - old_length) * kind);
    }
}
#endif
894
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895static PyObject*
896resize_compact(PyObject *unicode, Py_ssize_t length)
897{
898 Py_ssize_t char_size;
899 Py_ssize_t struct_size;
900 Py_ssize_t new_size;
901 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100902 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200903#ifdef Py_DEBUG
904 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905#endif
906
Victor Stinner79891572012-05-03 13:43:07 +0200907 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100909 assert(PyUnicode_IS_COMPACT(unicode));
910
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200911 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913 struct_size = sizeof(PyASCIIObject);
914 else
915 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200916 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919 PyErr_NoMemory();
920 return NULL;
921 }
922 new_size = (struct_size + (length + 1) * char_size);
923
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925 PyObject_DEL(_PyUnicode_UTF8(unicode));
926 _PyUnicode_UTF8(unicode) = NULL;
927 _PyUnicode_UTF8_LENGTH(unicode) = 0;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 _Py_DEC_REFTOTAL;
930 _Py_ForgetReference(unicode);
931
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300932 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100933 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100934 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 PyErr_NoMemory();
936 return NULL;
937 }
Victor Stinner84def372011-12-11 20:04:56 +0100938 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100944 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200945 _PyUnicode_WSTR_LENGTH(unicode) = length;
946 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100947 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_WSTR(unicode));
949 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100950 if (!PyUnicode_IS_ASCII(unicode))
951 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100952 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200953#ifdef Py_DEBUG
954 unicode_fill_invalid(unicode, old_length);
955#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200958 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 return unicode;
960}
961
Alexander Belopolsky40018472011-02-26 01:02:56 +0000962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200963resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964{
Victor Stinner95663112011-10-04 01:03:50 +0200965 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100966 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000969
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 if (PyUnicode_IS_READY(unicode)) {
971 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200972 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200974#ifdef Py_DEBUG
975 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
978 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200979 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982
983 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984 PyErr_NoMemory();
985 return -1;
986 }
987 new_size = (length + 1) * char_size;
988
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990 {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 data = (PyObject *)PyObject_REALLOC(data, new_size);
997 if (data == NULL) {
998 PyErr_NoMemory();
999 return -1;
1000 }
1001 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 _PyUnicode_WSTR_LENGTH(unicode) = length;
1005 }
1006 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001007 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 _PyUnicode_UTF8_LENGTH(unicode) = length;
1009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 _PyUnicode_LENGTH(unicode) = length;
1011 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001012#ifdef Py_DEBUG
1013 unicode_fill_invalid(unicode, old_length);
1014#endif
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001016 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinner95663112011-10-04 01:03:50 +02001020 assert(_PyUnicode_WSTR(unicode) != NULL);
1021
1022 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001023 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001024 PyErr_NoMemory();
1025 return -1;
1026 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001028 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001030 if (!wstr) {
1031 PyErr_NoMemory();
1032 return -1;
1033 }
1034 _PyUnicode_WSTR(unicode) = wstr;
1035 _PyUnicode_WSTR(unicode)[length] = 0;
1036 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001037 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return 0;
1039}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001073 Ux0000 terminated; some code (e.g. new_identifier)
1074 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
1186
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   The code units in [begin, end) are written to the 4-byte data of
   `unicode`: a high surrogate directly followed by a low surrogate is
   joined into one code point, every other unit is copied as is.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* valid pair: two code units become one code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src += 1;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
1522 assert(0);
1523 return -1;
1524 }
1525 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001526 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001527 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001528 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001529 Py_ssize_t i;
1530
Victor Stinnera0702ab2011-09-29 14:14:38 +02001531 for (i=0; i < how_many; i++) {
1532 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (ch > to_maxchar)
1534 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001535 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1536 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 }
1538 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 return 0;
1540}
1541
Victor Stinnerd3f08822012-05-29 12:57:52 +02001542void
1543_PyUnicode_FastCopyCharacters(
1544 PyObject *to, Py_ssize_t to_start,
1545 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546{
1547 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1548}
1549
1550Py_ssize_t
1551PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many)
1554{
1555 int err;
1556
1557 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561
Benjamin Petersonbac79492012-01-14 13:34:47 -05001562 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001564 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 return -1;
1566
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001567 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001568 PyErr_SetString(PyExc_IndexError, "string index out of range");
1569 return -1;
1570 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001571 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001572 PyErr_SetString(PyExc_IndexError, "string index out of range");
1573 return -1;
1574 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001575 if (how_many < 0) {
1576 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1577 return -1;
1578 }
1579 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1581 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001582 "Cannot write %zi characters at %zi "
1583 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001584 how_many, to_start, PyUnicode_GET_LENGTH(to));
1585 return -1;
1586 }
1587
1588 if (how_many == 0)
1589 return 0;
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 return -1;
1593
1594 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1595 if (err) {
1596 PyErr_Format(PyExc_SystemError,
1597 "Cannot copy %s characters "
1598 "into a string of %s characters",
1599 unicode_kind_name(from),
1600 unicode_kind_name(to));
1601 return -1;
1602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604}
1605
Victor Stinner17222162011-09-28 22:15:37 +02001606/* Find the maximum code point and count the number of surrogate pairs so a
1607 correct string length can be computed before converting a string to UCS4.
1608 This function counts single surrogates as a character and not as a pair.
1609
1610 Return 0 on success, or -1 on error. */
1611static int
1612find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1613 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614{
1615 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001616 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617
Victor Stinnerc53be962011-10-02 21:33:54 +02001618 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 *num_surrogates = 0;
1620 *maxchar = 0;
1621
1622 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001623#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001624 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1625 && (iter+1) < end
1626 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1627 {
1628 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1629 ++(*num_surrogates);
1630 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001634 {
1635 ch = *iter;
1636 iter++;
1637 }
1638 if (ch > *maxchar) {
1639 *maxchar = ch;
1640 if (*maxchar > MAX_UNICODE) {
1641 PyErr_Format(PyExc_ValueError,
1642 "character U+%x is not in range [U+0000; U+10ffff]",
1643 ch);
1644 return -1;
1645 }
1646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 }
1648 return 0;
1649}
1650
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001651int
1652_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653{
1654 wchar_t *end;
1655 Py_UCS4 maxchar = 0;
1656 Py_ssize_t num_surrogates;
1657#if SIZEOF_WCHAR_T == 2
1658 Py_ssize_t length_wo_surrogates;
1659#endif
1660
Georg Brandl7597add2011-10-05 16:36:47 +02001661 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001662 strings were created using _PyObject_New() and where no canonical
1663 representation (the str field) has been set yet aka strings
1664 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001665 assert(_PyUnicode_CHECK(unicode));
1666 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001668 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001669 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001670 /* Actually, it should neither be interned nor be anything else: */
1671 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001674 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677
1678 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001679 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1680 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyErr_NoMemory();
1682 return -1;
1683 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 _PyUnicode_WSTR(unicode), end,
1686 PyUnicode_1BYTE_DATA(unicode));
1687 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1688 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1689 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1690 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001691 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001693 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 }
1695 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001696 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001697 _PyUnicode_UTF8(unicode) = NULL;
1698 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
1700 PyObject_FREE(_PyUnicode_WSTR(unicode));
1701 _PyUnicode_WSTR(unicode) = NULL;
1702 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1703 }
1704 /* In this case we might have to convert down from 4-byte native
1705 wchar_t to 2-byte unicode. */
1706 else if (maxchar < 65536) {
1707 assert(num_surrogates == 0 &&
1708 "FindMaxCharAndNumSurrogatePairs() messed up");
1709
Victor Stinner506f5922011-09-28 22:34:18 +02001710#if SIZEOF_WCHAR_T == 2
1711 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718#else
1719 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001720 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001721 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001723 PyErr_NoMemory();
1724 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 }
Victor Stinner506f5922011-09-28 22:34:18 +02001726 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1727 _PyUnicode_WSTR(unicode), end,
1728 PyUnicode_2BYTE_DATA(unicode));
1729 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1730 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1731 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001732 _PyUnicode_UTF8(unicode) = NULL;
1733 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001734 PyObject_FREE(_PyUnicode_WSTR(unicode));
1735 _PyUnicode_WSTR(unicode) = NULL;
1736 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1737#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 }
1739 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1740 else {
1741#if SIZEOF_WCHAR_T == 2
1742 /* in case the native representation is 2-bytes, we need to allocate a
1743 new normalized 4-byte version. */
1744 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001745 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1746 PyErr_NoMemory();
1747 return -1;
1748 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001749 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1750 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 PyErr_NoMemory();
1752 return -1;
1753 }
1754 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001756 _PyUnicode_UTF8(unicode) = NULL;
1757 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001758 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 PyObject_FREE(_PyUnicode_WSTR(unicode));
1762 _PyUnicode_WSTR(unicode) = NULL;
1763 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1764#else
1765 assert(num_surrogates == 0);
1766
Victor Stinnerc3c74152011-10-02 20:39:55 +02001767 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001769 _PyUnicode_UTF8(unicode) = NULL;
1770 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1772#endif
1773 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1774 }
1775 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001776 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 return 0;
1778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001781unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald16807132007-05-25 13:52:07 +00001783 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 case SSTATE_NOT_INTERNED:
1785 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001786
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 case SSTATE_INTERNED_MORTAL:
1788 /* revive dead object temporarily for DelItem */
1789 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001790 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 Py_FatalError(
1792 "deletion of interned string failed");
1793 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001794
Benjamin Peterson29060642009-01-31 22:14:21 +00001795 case SSTATE_INTERNED_IMMORTAL:
1796 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001797
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 default:
1799 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001800 }
1801
Victor Stinner03490912011-10-03 23:45:12 +02001802 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001804 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001806 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001809 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string, or a cached one-character string from unicode_latin1[]),
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a canonical string of length one can be a cached character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1828
Alexander Belopolsky40018472011-02-26 01:02:56 +00001829static int
Victor Stinner488fa492011-12-12 00:01:39 +01001830unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831{
Victor Stinner488fa492011-12-12 00:01:39 +01001832 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 if (Py_REFCNT(unicode) != 1)
1834 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001835 if (_PyUnicode_HASH(unicode) != -1)
1836 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837 if (PyUnicode_CHECK_INTERNED(unicode))
1838 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001839 if (!PyUnicode_CheckExact(unicode))
1840 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001841#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001842 /* singleton refcount is greater than 1 */
1843 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001844#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001845 return 1;
1846}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848static int
1849unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850{
1851 PyObject *unicode;
1852 Py_ssize_t old_length;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856
1857 assert(unicode != NULL);
1858 assert(PyUnicode_Check(unicode));
1859 assert(0 <= length);
1860
Victor Stinner910337b2011-10-03 03:20:16 +02001861 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 old_length = PyUnicode_WSTR_LENGTH(unicode);
1863 else
1864 old_length = PyUnicode_GET_LENGTH(unicode);
1865 if (old_length == length)
1866 return 0;
1867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001869 _Py_INCREF_UNICODE_EMPTY();
1870 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001872 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001873 return 0;
1874 }
1875
Victor Stinner488fa492011-12-12 00:01:39 +01001876 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *copy = resize_copy(unicode, length);
1878 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001880 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882 }
1883
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001885 PyObject *new_unicode = resize_compact(unicode, length);
1886 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001888 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001891 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001892}
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001896{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897 PyObject *unicode;
1898 if (p_unicode == NULL) {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001903 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001904 {
1905 PyErr_BadInternalCall();
1906 return -1;
1907 }
1908 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001911/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001913 WARNING: The function doesn't copy the terminating null character and
1914 doesn't check the maximum character (may write a latin1 character in an
1915 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001916static void
1917unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001919{
1920 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923
1924 switch (kind) {
1925 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001927#ifdef Py_DEBUG
1928 if (PyUnicode_IS_ASCII(unicode)) {
1929 Py_UCS4 maxchar = ucs1lib_find_max_char(
1930 (const Py_UCS1*)str,
1931 (const Py_UCS1*)str + len);
1932 assert(maxchar < 128);
1933 }
1934#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001935 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 case PyUnicode_2BYTE_KIND: {
1939 Py_UCS2 *start = (Py_UCS2 *)data + index;
1940 Py_UCS2 *ucs2 = start;
1941 assert(index <= PyUnicode_GET_LENGTH(unicode));
1942
Victor Stinner184252a2012-06-16 02:57:41 +02001943 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 *ucs2 = (Py_UCS2)*str;
1945
1946 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001947 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 default: {
1950 Py_UCS4 *start = (Py_UCS4 *)data + index;
1951 Py_UCS4 *ucs4 = start;
1952 assert(kind == PyUnicode_4BYTE_KIND);
1953 assert(index <= PyUnicode_GET_LENGTH(unicode));
1954
Victor Stinner184252a2012-06-16 02:57:41 +02001955 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001956 *ucs4 = (Py_UCS4)*str;
1957
1958 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001959 }
1960 }
1961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963static PyObject*
1964get_latin1_char(unsigned char ch)
1965{
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode)
1970 return NULL;
1971 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 unicode_latin1[ch] = unicode;
1974 }
1975 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001976 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977}
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979static PyObject*
1980unicode_char(Py_UCS4 ch)
1981{
1982 PyObject *unicode;
1983
1984 assert(ch <= MAX_UNICODE);
1985
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001986 if (ch < 256)
1987 return get_latin1_char(ch);
1988
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 unicode = PyUnicode_New(1, ch);
1990 if (unicode == NULL)
1991 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001992
1993 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001996 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999 }
2000 assert(_PyUnicode_CheckConsistency(unicode, 1));
2001 return unicode;
2002}
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004PyObject *
2005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002007 if (u == NULL)
2008 return (PyObject*)_PyUnicode_New(size);
2009
2010 if (size < 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 return PyUnicode_FromWideChar(u, size);
2016}
2017
2018PyObject *
2019PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 Py_UCS4 maxchar = 0;
2023 Py_ssize_t num_surrogates;
2024
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL && size != 0) {
2026 PyErr_BadInternalCall();
2027 return NULL;
2028 }
2029
2030 if (size == -1) {
2031 size = wcslen(u);
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 /* If the Unicode data is known at construction time, we can apply
2035 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002038 if (size == 0)
2039 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 /* Single character Unicode objects in the Latin-1 range are
2042 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002043 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return get_latin1_char((unsigned char)*u);
2045
2046 /* If not empty and not single character, copy the Unicode data
2047 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002048 if (find_maxchar_surrogates(u, u + size,
2049 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 return NULL;
2051
Victor Stinner8faf8212011-12-08 22:14:11 +01002052 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!unicode)
2054 return NULL;
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 switch (PyUnicode_KIND(unicode)) {
2057 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060 break;
2061 case PyUnicode_2BYTE_KIND:
2062#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002063 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002065 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067#endif
2068 break;
2069 case PyUnicode_4BYTE_KIND:
2070#if SIZEOF_WCHAR_T == 2
2071 /* This is the only case which has to process surrogates, thus
2072 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002073 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#else
2075 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002076 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077#endif
2078 break;
2079 default:
2080 assert(0 && "Impossible state");
2081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002083 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (size < 0) {
2090 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 return NULL;
2093 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002094 if (u != NULL)
2095 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096 else
2097 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002098}
2099
Alexander Belopolsky40018472011-02-26 01:02:56 +00002100PyObject *
2101PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102{
2103 size_t size = strlen(u);
2104 if (size > PY_SSIZE_T_MAX) {
2105 PyErr_SetString(PyExc_OverflowError, "input too long");
2106 return NULL;
2107 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002108 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002109}
2110
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002111PyObject *
2112_PyUnicode_FromId(_Py_Identifier *id)
2113{
2114 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002115 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116 strlen(id->string),
2117 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002118 if (!id->object)
2119 return NULL;
2120 PyUnicode_InternInPlace(&id->object);
2121 assert(!id->next);
2122 id->next = static_strings;
2123 static_strings = id;
2124 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002125 return id->object;
2126}
2127
2128void
2129_PyUnicode_ClearStaticStrings()
2130{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002131 _Py_Identifier *tmp, *s = static_strings;
2132 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002133 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002134 tmp = s->next;
2135 s->next = NULL;
2136 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002138 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139}
2140
Benjamin Peterson0df54292012-03-26 14:50:32 -04002141/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Victor Stinnerd3f08822012-05-29 12:57:52 +02002143PyObject*
2144_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002145{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002146 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002148 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002149#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002152 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002153 }
Victor Stinner785938e2011-12-11 20:09:03 +01002154 unicode = PyUnicode_New(size, 127);
2155 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002156 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002157 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002160}
2161
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002162static Py_UCS4
2163kind_maxchar_limit(unsigned int kind)
2164{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002165 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002166 case PyUnicode_1BYTE_KIND:
2167 return 0x80;
2168 case PyUnicode_2BYTE_KIND:
2169 return 0x100;
2170 case PyUnicode_4BYTE_KIND:
2171 return 0x10000;
2172 default:
2173 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002175 }
2176}
2177
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002178static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002179align_maxchar(Py_UCS4 maxchar)
2180{
2181 if (maxchar <= 127)
2182 return 127;
2183 else if (maxchar <= 255)
2184 return 255;
2185 else if (maxchar <= 65535)
2186 return 65535;
2187 else
2188 return MAX_UNICODE;
2189}
2190
Victor Stinner702c7342011-10-05 13:50:52 +02002191static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002192_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002195 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196
Serhiy Storchaka678db842013-01-26 12:16:36 +02002197 if (size == 0)
2198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002199 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002200 if (size == 1)
2201 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002202
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002203 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002204 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 if (!res)
2206 return NULL;
2207 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002208 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002210}
2211
Victor Stinnere57b1c02011-09-28 22:20:48 +02002212static PyObject*
2213_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214{
2215 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002216 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002217
Serhiy Storchaka678db842013-01-26 12:16:36 +02002218 if (size == 0)
2219 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002220 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002221 if (size == 1)
2222 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002223
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002224 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002225 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 if (!res)
2227 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002230 else {
2231 _PyUnicode_CONVERT_BYTES(
2232 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002234 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 return res;
2236}
2237
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238static PyObject*
2239_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240{
2241 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002242 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243
Serhiy Storchaka678db842013-01-26 12:16:36 +02002244 if (size == 0)
2245 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002247 if (size == 1)
2248 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002250 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 if (!res)
2253 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002254 if (max_char < 256)
2255 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256 PyUnicode_1BYTE_DATA(res));
2257 else if (max_char < 0x10000)
2258 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259 PyUnicode_2BYTE_DATA(res));
2260 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002262 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 return res;
2264}
2265
2266PyObject*
2267PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002269 if (size < 0) {
2270 PyErr_SetString(PyExc_ValueError, "size must be positive");
2271 return NULL;
2272 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002273 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002275 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002277 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002281 PyErr_SetString(PyExc_SystemError, "invalid kind");
2282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284}
2285
Victor Stinnerece58de2012-04-23 23:36:38 +02002286Py_UCS4
2287_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288{
2289 enum PyUnicode_Kind kind;
2290 void *startptr, *endptr;
2291
2292 assert(PyUnicode_IS_READY(unicode));
2293 assert(0 <= start);
2294 assert(end <= PyUnicode_GET_LENGTH(unicode));
2295 assert(start <= end);
2296
2297 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2298 return PyUnicode_MAX_CHAR_VALUE(unicode);
2299
2300 if (start == end)
2301 return 127;
2302
Victor Stinner94d558b2012-04-27 22:26:58 +02002303 if (PyUnicode_IS_ASCII(unicode))
2304 return 127;
2305
Victor Stinnerece58de2012-04-23 23:36:38 +02002306 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002307 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002308 endptr = (char *)startptr + end * kind;
2309 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002310 switch(kind) {
2311 case PyUnicode_1BYTE_KIND:
2312 return ucs1lib_find_max_char(startptr, endptr);
2313 case PyUnicode_2BYTE_KIND:
2314 return ucs2lib_find_max_char(startptr, endptr);
2315 case PyUnicode_4BYTE_KIND:
2316 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002317 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002318 assert(0);
2319 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002320 }
2321}
2322
Victor Stinner25a4b292011-10-06 12:31:55 +02002323/* Ensure that a string uses the most efficient storage, if it is not the
2324 case: create a new string with of the right kind. Write NULL into *p_unicode
2325 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002326static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002327unicode_adjust_maxchar(PyObject **p_unicode)
2328{
2329 PyObject *unicode, *copy;
2330 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002331 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002332 unsigned int kind;
2333
2334 assert(p_unicode != NULL);
2335 unicode = *p_unicode;
2336 assert(PyUnicode_IS_READY(unicode));
2337 if (PyUnicode_IS_ASCII(unicode))
2338 return;
2339
2340 len = PyUnicode_GET_LENGTH(unicode);
2341 kind = PyUnicode_KIND(unicode);
2342 if (kind == PyUnicode_1BYTE_KIND) {
2343 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002344 max_char = ucs1lib_find_max_char(u, u + len);
2345 if (max_char >= 128)
2346 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002347 }
2348 else if (kind == PyUnicode_2BYTE_KIND) {
2349 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002350 max_char = ucs2lib_find_max_char(u, u + len);
2351 if (max_char >= 256)
2352 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002353 }
2354 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002356 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 max_char = ucs4lib_find_max_char(u, u + len);
2358 if (max_char >= 0x10000)
2359 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002360 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002361 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002362 if (copy != NULL)
2363 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 Py_DECREF(unicode);
2365 *p_unicode = copy;
2366}
2367
Victor Stinner034f6cf2011-09-30 02:26:44 +02002368PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002369_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370{
Victor Stinner87af4f22011-11-21 23:03:47 +01002371 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002372 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002373
Victor Stinner034f6cf2011-09-30 02:26:44 +02002374 if (!PyUnicode_Check(unicode)) {
2375 PyErr_BadInternalCall();
2376 return NULL;
2377 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002378 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002380
Victor Stinner87af4f22011-11-21 23:03:47 +01002381 length = PyUnicode_GET_LENGTH(unicode);
2382 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 if (!copy)
2384 return NULL;
2385 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2386
Christian Heimesf051e432016-09-13 20:22:02 +02002387 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002388 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002389 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002390 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002391}
2392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393
Victor Stinnerbc603d12011-10-02 01:00:40 +02002394/* Widen Unicode objects to larger buffers. Don't write terminating null
2395 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396
2397void*
2398_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2399{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002400 Py_ssize_t len;
2401 void *result;
2402 unsigned int skind;
2403
Benjamin Petersonbac79492012-01-14 13:34:47 -05002404 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405 return NULL;
2406
2407 len = PyUnicode_GET_LENGTH(s);
2408 skind = PyUnicode_KIND(s);
2409 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002410 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 return NULL;
2412 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002413 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002415 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 if (!result)
2417 return PyErr_NoMemory();
2418 assert(skind == PyUnicode_1BYTE_KIND);
2419 _PyUnicode_CONVERT_BYTES(
2420 Py_UCS1, Py_UCS2,
2421 PyUnicode_1BYTE_DATA(s),
2422 PyUnicode_1BYTE_DATA(s) + len,
2423 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 if (skind == PyUnicode_2BYTE_KIND) {
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS2, Py_UCS4,
2432 PyUnicode_2BYTE_DATA(s),
2433 PyUnicode_2BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 else {
2437 assert(skind == PyUnicode_1BYTE_KIND);
2438 _PyUnicode_CONVERT_BYTES(
2439 Py_UCS1, Py_UCS4,
2440 PyUnicode_1BYTE_DATA(s),
2441 PyUnicode_1BYTE_DATA(s) + len,
2442 result);
2443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002445 default:
2446 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 }
Victor Stinner01698042011-10-04 00:04:26 +02002448 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 return NULL;
2450}
2451
2452static Py_UCS4*
2453as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2454 int copy_null)
2455{
2456 int kind;
2457 void *data;
2458 Py_ssize_t len, targetlen;
2459 if (PyUnicode_READY(string) == -1)
2460 return NULL;
2461 kind = PyUnicode_KIND(string);
2462 data = PyUnicode_DATA(string);
2463 len = PyUnicode_GET_LENGTH(string);
2464 targetlen = len;
2465 if (copy_null)
2466 targetlen++;
2467 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002468 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!target) {
2470 PyErr_NoMemory();
2471 return NULL;
2472 }
2473 }
2474 else {
2475 if (targetsize < targetlen) {
2476 PyErr_Format(PyExc_SystemError,
2477 "string is longer than the buffer");
2478 if (copy_null && 0 < targetsize)
2479 target[0] = 0;
2480 return NULL;
2481 }
2482 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002483 if (kind == PyUnicode_1BYTE_KIND) {
2484 Py_UCS1 *start = (Py_UCS1 *) data;
2485 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002487 else if (kind == PyUnicode_2BYTE_KIND) {
2488 Py_UCS2 *start = (Py_UCS2 *) data;
2489 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2490 }
2491 else {
2492 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002493 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 if (copy_null)
2496 target[len] = 0;
2497 return target;
2498}
2499
2500Py_UCS4*
2501PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002504 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 PyErr_BadInternalCall();
2506 return NULL;
2507 }
2508 return as_ucs4(string, target, targetsize, copy_null);
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4Copy(PyObject *string)
2513{
2514 return as_ucs4(string, NULL, 0, 1);
2515}
2516
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).

   This macro sizes the stack buffers that sprintf() fills when integer
   and pointer conversions are formatted further down in this file.
   NOTE(review): the expression evaluates to the rounded-up digit count
   plus one; confirm that this leaves room for the terminating NUL, and
   for the "0x" prefix the %p handler may prepend, on every platform. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002521
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002522static int
2523unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2524 Py_ssize_t width, Py_ssize_t precision)
2525{
2526 Py_ssize_t length, fill, arglen;
2527 Py_UCS4 maxchar;
2528
2529 if (PyUnicode_READY(str) == -1)
2530 return -1;
2531
2532 length = PyUnicode_GET_LENGTH(str);
2533 if ((precision == -1 || precision >= length)
2534 && width <= length)
2535 return _PyUnicodeWriter_WriteStr(writer, str);
2536
2537 if (precision != -1)
2538 length = Py_MIN(precision, length);
2539
2540 arglen = Py_MAX(length, width);
2541 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2542 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2543 else
2544 maxchar = writer->maxchar;
2545
2546 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2547 return -1;
2548
2549 if (width > length) {
2550 fill = width - length;
2551 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2552 return -1;
2553 writer->pos += fill;
2554 }
2555
2556 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2557 str, 0, length);
2558 writer->pos += length;
2559 return 0;
2560}
2561
2562static int
2563unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2564 Py_ssize_t width, Py_ssize_t precision)
2565{
2566 /* UTF-8 */
2567 Py_ssize_t length;
2568 PyObject *unicode;
2569 int res;
2570
2571 length = strlen(str);
2572 if (precision != -1)
2573 length = Py_MIN(length, precision);
2574 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2575 if (unicode == NULL)
2576 return -1;
2577
2578 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2579 Py_DECREF(unicode);
2580 return res;
2581}
2582
Victor Stinner96865452011-03-01 23:44:09 +00002583static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002584unicode_fromformat_arg(_PyUnicodeWriter *writer,
2585 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002586{
Victor Stinnere215d962012-10-06 23:03:36 +02002587 const char *p;
2588 Py_ssize_t len;
2589 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 Py_ssize_t width;
2591 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002592 int longflag;
2593 int longlongflag;
2594 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002596
2597 p = f;
2598 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002599 zeropad = 0;
2600 if (*f == '0') {
2601 zeropad = 1;
2602 f++;
2603 }
Victor Stinner96865452011-03-01 23:44:09 +00002604
2605 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 width = -1;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002609 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002610 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002612 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002614 return NULL;
2615 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002616 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002617 f++;
2618 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 }
2620 precision = -1;
2621 if (*f == '.') {
2622 f++;
2623 if (Py_ISDIGIT((unsigned)*f)) {
2624 precision = (*f - '0');
2625 f++;
2626 while (Py_ISDIGIT((unsigned)*f)) {
2627 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2628 PyErr_SetString(PyExc_ValueError,
2629 "precision too big");
2630 return NULL;
2631 }
2632 precision = (precision * 10) + (*f - '0');
2633 f++;
2634 }
2635 }
Victor Stinner96865452011-03-01 23:44:09 +00002636 if (*f == '%') {
2637 /* "%.3%s" => f points to "3" */
2638 f--;
2639 }
2640 }
2641 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002642 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002643 f--;
2644 }
Victor Stinner96865452011-03-01 23:44:09 +00002645
2646 /* Handle %ld, %lu, %lld and %llu. */
2647 longflag = 0;
2648 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002649 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002650 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002651 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002652 longflag = 1;
2653 ++f;
2654 }
Victor Stinner96865452011-03-01 23:44:09 +00002655 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002656 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002657 longlongflag = 1;
2658 f += 2;
2659 }
Victor Stinner96865452011-03-01 23:44:09 +00002660 }
2661 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002663 size_tflag = 1;
2664 ++f;
2665 }
Victor Stinnere215d962012-10-06 23:03:36 +02002666
2667 if (f[1] == '\0')
2668 writer->overallocate = 0;
2669
2670 switch (*f) {
2671 case 'c':
2672 {
2673 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002674 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002675 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002676 "character argument not in range(0x110000)");
2677 return NULL;
2678 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002679 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002680 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002681 break;
2682 }
2683
2684 case 'i':
2685 case 'd':
2686 case 'u':
2687 case 'x':
2688 {
2689 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002690 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002692
2693 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002694 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002695 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002697 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002698 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002699 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002700 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002701 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002702 va_arg(*vargs, size_t));
2703 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, unsigned int));
2706 }
2707 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002709 }
2710 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002714 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002715 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002716 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002717 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002718 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002719 va_arg(*vargs, Py_ssize_t));
2720 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002721 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_arg(*vargs, int));
2723 }
2724 assert(len >= 0);
2725
Victor Stinnere215d962012-10-06 23:03:36 +02002726 if (precision < len)
2727 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728
2729 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2731 return NULL;
2732
Victor Stinnere215d962012-10-06 23:03:36 +02002733 if (width > precision) {
2734 Py_UCS4 fillchar;
2735 fill = width - precision;
2736 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002737 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2738 return NULL;
2739 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 }
Victor Stinner15a11362012-10-06 23:48:20 +02002741 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002743 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2744 return NULL;
2745 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002746 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747
Victor Stinner4a587072013-11-19 12:54:53 +01002748 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2749 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002750 break;
2751 }
2752
2753 case 'p':
2754 {
2755 char number[MAX_LONG_LONG_CHARS];
2756
2757 len = sprintf(number, "%p", va_arg(*vargs, void*));
2758 assert(len >= 0);
2759
2760 /* %p is ill-defined: ensure leading 0x. */
2761 if (number[1] == 'X')
2762 number[1] = 'x';
2763 else if (number[1] != 'x') {
2764 memmove(number + 2, number,
2765 strlen(number) + 1);
2766 number[0] = '0';
2767 number[1] = 'x';
2768 len += 2;
2769 }
2770
Victor Stinner4a587072013-11-19 12:54:53 +01002771 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002772 return NULL;
2773 break;
2774 }
2775
2776 case 's':
2777 {
2778 /* UTF-8 */
2779 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002782 break;
2783 }
2784
2785 case 'U':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 assert(obj && _PyUnicode_CHECK(obj));
2789
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
2792 break;
2793 }
2794
2795 case 'V':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002799 if (obj) {
2800 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 }
2804 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 assert(str != NULL);
2806 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002807 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002808 }
2809 break;
2810 }
2811
2812 case 'S':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 PyObject *str;
2816 assert(obj);
2817 str = PyObject_Str(obj);
2818 if (!str)
2819 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002820 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 Py_DECREF(str);
2822 return NULL;
2823 }
2824 Py_DECREF(str);
2825 break;
2826 }
2827
2828 case 'R':
2829 {
2830 PyObject *obj = va_arg(*vargs, PyObject *);
2831 PyObject *repr;
2832 assert(obj);
2833 repr = PyObject_Repr(obj);
2834 if (!repr)
2835 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002836 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 Py_DECREF(repr);
2838 return NULL;
2839 }
2840 Py_DECREF(repr);
2841 break;
2842 }
2843
2844 case 'A':
2845 {
2846 PyObject *obj = va_arg(*vargs, PyObject *);
2847 PyObject *ascii;
2848 assert(obj);
2849 ascii = PyObject_ASCII(obj);
2850 if (!ascii)
2851 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 Py_DECREF(ascii);
2854 return NULL;
2855 }
2856 Py_DECREF(ascii);
2857 break;
2858 }
2859
2860 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002861 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864
2865 default:
2866 /* if we stumble upon an unknown formatting code, copy the rest
2867 of the format string to the output string. (we cannot just
2868 skip the code, since there's no way to know what's in the
2869 argument list) */
2870 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002871 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002872 return NULL;
2873 f = p+len;
2874 return f;
2875 }
2876
2877 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002878 return f;
2879}
2880
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881PyObject *
2882PyUnicode_FromFormatV(const char *format, va_list vargs)
2883{
Victor Stinnere215d962012-10-06 23:03:36 +02002884 va_list vargs2;
2885 const char *f;
2886 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002887
Victor Stinner8f674cc2013-04-17 23:02:17 +02002888 _PyUnicodeWriter_Init(&writer);
2889 writer.min_length = strlen(format) + 100;
2890 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002891
Benjamin Peterson0c212142016-09-20 20:39:33 -07002892 // Copy varags to be able to pass a reference to a subfunction.
2893 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002894
2895 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002897 f = unicode_fromformat_arg(&writer, f, &vargs2);
2898 if (f == NULL)
2899 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002902 const char *p;
2903 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904
Victor Stinnere215d962012-10-06 23:03:36 +02002905 p = f;
2906 do
2907 {
2908 if ((unsigned char)*p > 127) {
2909 PyErr_Format(PyExc_ValueError,
2910 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2911 "string, got a non-ASCII byte: 0x%02x",
2912 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002913 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 p++;
2916 }
2917 while (*p != '\0' && *p != '%');
2918 len = p - f;
2919
2920 if (*p == '\0')
2921 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002922
2923 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925
2926 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002928 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002929 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002930 return _PyUnicodeWriter_Finish(&writer);
2931
2932 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002933 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002934 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002935 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936}
2937
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938PyObject *
2939PyUnicode_FromFormat(const char *format, ...)
2940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002941 PyObject* ret;
2942 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002943
2944#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 ret = PyUnicode_FromFormatV(format, vargs);
2950 va_end(vargs);
2951 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002952}
2953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954#ifdef HAVE_WCHAR_H
2955
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002956/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002957
Victor Stinnerd88d9832011-09-06 02:00:05 +02002958 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 character) required to convert the unicode object. Ignore size argument.
2960
Victor Stinnerd88d9832011-09-06 02:00:05 +02002961 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002962 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002963 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002964Py_ssize_t
2965PyUnicode_AsWideChar(PyObject *unicode,
2966 wchar_t *w,
2967 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002968{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002969 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002970 const wchar_t *wstr;
2971
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002972 if (unicode == NULL) {
2973 PyErr_BadInternalCall();
2974 return -1;
2975 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002976 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002977 if (wstr == NULL)
2978 return -1;
2979
Victor Stinner5593d8a2010-10-02 11:11:27 +00002980 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002981 if (size > res)
2982 size = res + 1;
2983 else
2984 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002985 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002986 return res;
2987 }
2988 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002989 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002990}
2991
Victor Stinner137c34c2010-09-29 10:25:54 +00002992wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002993PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002994 Py_ssize_t *size)
2995{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002996 const wchar_t *wstr;
2997 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00002998 Py_ssize_t buflen;
2999
3000 if (unicode == NULL) {
3001 PyErr_BadInternalCall();
3002 return NULL;
3003 }
3004
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003005 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3006 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003007 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003008 }
3009 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3010 PyErr_SetString(PyExc_ValueError,
3011 "embedded null character");
3012 return NULL;
3013 }
3014
3015 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003016 if (buffer == NULL) {
3017 PyErr_NoMemory();
3018 return NULL;
3019 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003020 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003021 if (size != NULL)
3022 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003023 return buffer;
3024}
3025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003026#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027
Alexander Belopolsky40018472011-02-26 01:02:56 +00003028PyObject *
3029PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003030{
Victor Stinner8faf8212011-12-08 22:14:11 +01003031 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 PyErr_SetString(PyExc_ValueError,
3033 "chr() arg not in range(0x110000)");
3034 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003035 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003036
Victor Stinner985a82a2014-01-03 12:53:47 +01003037 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003041PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003043 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003045 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003046 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003047 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003048 Py_INCREF(obj);
3049 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003050 }
3051 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 /* For a Unicode subtype that's not a Unicode object,
3053 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003054 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003055 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003056 PyErr_Format(PyExc_TypeError,
3057 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003058 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003059 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003060}
3061
Alexander Belopolsky40018472011-02-26 01:02:56 +00003062PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003063PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003064 const char *encoding,
3065 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003066{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003067 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003069
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 PyErr_BadInternalCall();
3072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003074
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003075 /* Decoding bytes objects is the most common case and should be fast */
3076 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003077 if (PyBytes_GET_SIZE(obj) == 0)
3078 _Py_RETURN_UNICODE_EMPTY();
3079 v = PyUnicode_Decode(
3080 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3081 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003082 return v;
3083 }
3084
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003085 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 PyErr_SetString(PyExc_TypeError,
3087 "decoding str is not supported");
3088 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003089 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003090
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003091 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3092 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3093 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003094 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003095 Py_TYPE(obj)->tp_name);
3096 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003097 }
Tim Petersced69f82003-09-16 20:30:58 +00003098
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003099 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003100 PyBuffer_Release(&buffer);
3101 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003103
Serhiy Storchaka05997252013-01-26 12:14:02 +02003104 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003105 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003106 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107}
3108
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to lower in lowercase. Every
   run of other characters between two copied characters becomes a single
   '_'; such runs at the very start or end of the name produce nothing.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1). lower is only null-terminated on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *out_last;
    int pending_sep;

    assert(encoding != NULL);

    out = lower;
    /* Last slot of the buffer: reserved for the terminating null. */
    out_last = &lower[lower_len - 1];
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               copied character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_last) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3157
Alexander Belopolsky40018472011-02-26 01:02:56 +00003158PyObject *
3159PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003160 Py_ssize_t size,
3161 const char *encoding,
3162 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003163{
3164 PyObject *buffer = NULL, *unicode;
3165 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003166 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3167
3168 if (encoding == NULL) {
3169 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3170 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003171
Fred Drakee4315f52000-05-09 19:53:39 +00003172 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003173 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3174 char *lower = buflower;
3175
3176 /* Fast paths */
3177 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3178 lower += 3;
3179 if (*lower == '_') {
3180 /* Match "utf8" and "utf_8" */
3181 lower++;
3182 }
3183
3184 if (lower[0] == '8' && lower[1] == 0) {
3185 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3186 }
3187 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3188 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3189 }
3190 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3191 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3192 }
3193 }
3194 else {
3195 if (strcmp(lower, "ascii") == 0
3196 || strcmp(lower, "us_ascii") == 0) {
3197 return PyUnicode_DecodeASCII(s, size, errors);
3198 }
Steve Dowercc16be82016-09-08 10:35:16 -07003199 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003200 else if (strcmp(lower, "mbcs") == 0) {
3201 return PyUnicode_DecodeMBCS(s, size, errors);
3202 }
3203 #endif
3204 else if (strcmp(lower, "latin1") == 0
3205 || strcmp(lower, "latin_1") == 0
3206 || strcmp(lower, "iso_8859_1") == 0
3207 || strcmp(lower, "iso8859_1") == 0) {
3208 return PyUnicode_DecodeLatin1(s, size, errors);
3209 }
3210 }
Victor Stinner37296e82010-06-10 13:36:23 +00003211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212
3213 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003214 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003215 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003216 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003217 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 if (buffer == NULL)
3219 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003220 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 if (unicode == NULL)
3222 goto onError;
3223 if (!PyUnicode_Check(unicode)) {
3224 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003225 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3226 "use codecs.decode() to decode to arbitrary types",
3227 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003228 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 Py_DECREF(unicode);
3230 goto onError;
3231 }
3232 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003233 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003234
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 Py_XDECREF(buffer);
3237 return NULL;
3238}
3239
Alexander Belopolsky40018472011-02-26 01:02:56 +00003240PyObject *
3241PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003242 const char *encoding,
3243 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003244{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003245 if (!PyUnicode_Check(unicode)) {
3246 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003247 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003248 }
3249
Serhiy Storchaka00939072016-10-27 21:05:49 +03003250 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3251 "PyUnicode_AsDecodedObject() is deprecated; "
3252 "use PyCodec_Decode() to decode from str", 1) < 0)
3253 return NULL;
3254
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003255 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003257
3258 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003259 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003260}
3261
Alexander Belopolsky40018472011-02-26 01:02:56 +00003262PyObject *
3263PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003264 const char *encoding,
3265 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003266{
3267 PyObject *v;
3268
3269 if (!PyUnicode_Check(unicode)) {
3270 PyErr_BadArgument();
3271 goto onError;
3272 }
3273
Serhiy Storchaka00939072016-10-27 21:05:49 +03003274 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3275 "PyUnicode_AsDecodedUnicode() is deprecated; "
3276 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3277 return NULL;
3278
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003279 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003281
3282 /* Decode via the codec registry */
3283 v = PyCodec_Decode(unicode, encoding, errors);
3284 if (v == NULL)
3285 goto onError;
3286 if (!PyUnicode_Check(v)) {
3287 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003288 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3289 "use codecs.decode() to decode to arbitrary types",
3290 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003291 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292 Py_DECREF(v);
3293 goto onError;
3294 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003295 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003296
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003298 return NULL;
3299}
3300
Alexander Belopolsky40018472011-02-26 01:02:56 +00003301PyObject *
3302PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003303 Py_ssize_t size,
3304 const char *encoding,
3305 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306{
3307 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003308
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003309 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3313 Py_DECREF(unicode);
3314 return v;
3315}
3316
Alexander Belopolsky40018472011-02-26 01:02:56 +00003317PyObject *
3318PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003319 const char *encoding,
3320 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003321{
3322 PyObject *v;
3323
3324 if (!PyUnicode_Check(unicode)) {
3325 PyErr_BadArgument();
3326 goto onError;
3327 }
3328
Serhiy Storchaka00939072016-10-27 21:05:49 +03003329 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3330 "PyUnicode_AsEncodedObject() is deprecated; "
3331 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3332 "or PyCodec_Encode() for generic encoding", 1) < 0)
3333 return NULL;
3334
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003335 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003337
3338 /* Encode via the codec registry */
3339 v = PyCodec_Encode(unicode, encoding, errors);
3340 if (v == NULL)
3341 goto onError;
3342 return v;
3343
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003345 return NULL;
3346}
3347
/* Find the index of the first wide character of wstr that wcstombs()
   cannot convert in the current locale.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide). The index of the first
   element of the failing character is returned. If every character
   converts, 0 is returned, which is indistinguishable from a failure at
   index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Scratch string holding exactly one character plus its terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep a surrogate pair together so it is converted as one
               character. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif

        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return char_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3394
Victor Stinner1b579672011-12-17 05:47:23 +01003395static int
3396locale_error_handler(const char *errors, int *surrogateescape)
3397{
Victor Stinner50149202015-09-22 00:26:54 +02003398 _Py_error_handler error_handler = get_error_handler(errors);
3399 switch (error_handler)
3400 {
3401 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003402 *surrogateescape = 0;
3403 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003404 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003405 *surrogateescape = 1;
3406 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003407 default:
3408 PyErr_Format(PyExc_ValueError,
3409 "only 'strict' and 'surrogateescape' error handlers "
3410 "are supported, not '%s'",
3411 errors);
3412 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003413 }
Victor Stinner1b579672011-12-17 05:47:23 +01003414}
3415
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003417PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418{
3419 Py_ssize_t wlen, wlen2;
3420 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003422 PyObject *bytes, *reason, *exc;
3423 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003424 int surrogateescape;
3425
3426 if (locale_error_handler(errors, &surrogateescape) < 0)
3427 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003428
3429 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3430 if (wstr == NULL)
3431 return NULL;
3432
3433 wlen2 = wcslen(wstr);
3434 if (wlen2 != wlen) {
3435 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003436 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003437 return NULL;
3438 }
3439
3440 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003441 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003442 char *str;
3443
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003444 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003445 if (str == NULL) {
3446 if (error_pos == (size_t)-1) {
3447 PyErr_NoMemory();
3448 PyMem_Free(wstr);
3449 return NULL;
3450 }
3451 else {
3452 goto encode_error;
3453 }
3454 }
3455 PyMem_Free(wstr);
3456
3457 bytes = PyBytes_FromString(str);
3458 PyMem_Free(str);
3459 }
3460 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003461 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003462 size_t len, len2;
3463
3464 len = wcstombs(NULL, wstr, 0);
3465 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003466 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003467 goto encode_error;
3468 }
3469
3470 bytes = PyBytes_FromStringAndSize(NULL, len);
3471 if (bytes == NULL) {
3472 PyMem_Free(wstr);
3473 return NULL;
3474 }
3475
3476 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3477 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003478 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003479 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003480 goto encode_error;
3481 }
3482 PyMem_Free(wstr);
3483 }
3484 return bytes;
3485
3486encode_error:
3487 errmsg = strerror(errno);
3488 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003489
3490 if (error_pos == (size_t)-1)
3491 error_pos = wcstombs_errorpos(wstr);
3492
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003494
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003495 wstr = Py_DecodeLocale(errmsg, &errlen);
3496 if (wstr != NULL) {
3497 reason = PyUnicode_FromWideChar(wstr, errlen);
3498 PyMem_RawFree(wstr);
3499 } else {
3500 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003501 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003502
Victor Stinner2f197072011-12-17 07:08:30 +01003503 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003504 reason = PyUnicode_FromString(
3505 "wcstombs() encountered an unencodable "
3506 "wide character");
3507 if (reason == NULL)
3508 return NULL;
3509
3510 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3511 "locale", unicode,
3512 (Py_ssize_t)error_pos,
3513 (Py_ssize_t)(error_pos+1),
3514 reason);
3515 Py_DECREF(reason);
3516 if (exc != NULL) {
3517 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003518 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003519 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003520 return NULL;
3521}
3522
Victor Stinnerad158722010-10-27 00:25:46 +00003523PyObject *
3524PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003525{
Steve Dowercc16be82016-09-08 10:35:16 -07003526#if defined(__APPLE__)
3527 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003528#else
Victor Stinner793b5312011-04-27 00:24:21 +02003529 PyInterpreterState *interp = PyThreadState_GET()->interp;
3530 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3531 cannot use it to encode and decode filenames before it is loaded. Load
3532 the Python codec requires to encode at least its own filename. Use the C
3533 version of the locale codec until the codec registry is initialized and
3534 the Python codec is loaded.
3535
3536 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3537 cannot only rely on it: check also interp->fscodec_initialized for
3538 subinterpreters. */
3539 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003540 return PyUnicode_AsEncodedString(unicode,
3541 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003542 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003543 }
3544 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003545 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003546 }
Victor Stinnerad158722010-10-27 00:25:46 +00003547#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003548}
3549
Alexander Belopolsky40018472011-02-26 01:02:56 +00003550PyObject *
3551PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003552 const char *encoding,
3553 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554{
3555 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003556 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003557
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 if (!PyUnicode_Check(unicode)) {
3559 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 }
Fred Drakee4315f52000-05-09 19:53:39 +00003562
Victor Stinner942889a2016-09-05 15:40:10 -07003563 if (encoding == NULL) {
3564 return _PyUnicode_AsUTF8String(unicode, errors);
3565 }
3566
Fred Drakee4315f52000-05-09 19:53:39 +00003567 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003568 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3569 char *lower = buflower;
3570
3571 /* Fast paths */
3572 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3573 lower += 3;
3574 if (*lower == '_') {
3575 /* Match "utf8" and "utf_8" */
3576 lower++;
3577 }
3578
3579 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003580 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003581 }
3582 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3583 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3584 }
3585 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3586 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3587 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003588 }
Victor Stinner942889a2016-09-05 15:40:10 -07003589 else {
3590 if (strcmp(lower, "ascii") == 0
3591 || strcmp(lower, "us_ascii") == 0) {
3592 return _PyUnicode_AsASCIIString(unicode, errors);
3593 }
Steve Dowercc16be82016-09-08 10:35:16 -07003594#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003595 else if (strcmp(lower, "mbcs") == 0) {
3596 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3597 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003598#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003599 else if (strcmp(lower, "latin1") == 0 ||
3600 strcmp(lower, "latin_1") == 0 ||
3601 strcmp(lower, "iso_8859_1") == 0 ||
3602 strcmp(lower, "iso8859_1") == 0) {
3603 return _PyUnicode_AsLatin1String(unicode, errors);
3604 }
3605 }
Victor Stinner37296e82010-06-10 13:36:23 +00003606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607
3608 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003609 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003611 return NULL;
3612
3613 /* The normal path */
3614 if (PyBytes_Check(v))
3615 return v;
3616
3617 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003618 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003619 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003620 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003621
3622 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003623 "encoder %s returned bytearray instead of bytes; "
3624 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003625 encoding);
3626 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003627 Py_DECREF(v);
3628 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003629 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003630
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003631 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3632 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003633 Py_DECREF(v);
3634 return b;
3635 }
3636
3637 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003638 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3639 "use codecs.encode() to encode to arbitrary types",
3640 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003641 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003642 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003643 return NULL;
3644}
3645
Alexander Belopolsky40018472011-02-26 01:02:56 +00003646PyObject *
3647PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003648 const char *encoding,
3649 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003650{
3651 PyObject *v;
3652
3653 if (!PyUnicode_Check(unicode)) {
3654 PyErr_BadArgument();
3655 goto onError;
3656 }
3657
Serhiy Storchaka00939072016-10-27 21:05:49 +03003658 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3659 "PyUnicode_AsEncodedUnicode() is deprecated; "
3660 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3661 return NULL;
3662
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003663 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003665
3666 /* Encode via the codec registry */
3667 v = PyCodec_Encode(unicode, encoding, errors);
3668 if (v == NULL)
3669 goto onError;
3670 if (!PyUnicode_Check(v)) {
3671 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003672 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3673 "use codecs.encode() to encode to arbitrary types",
3674 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003675 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003676 Py_DECREF(v);
3677 goto onError;
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003680
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 return NULL;
3683}
3684
/* Locate the first undecodable multibyte sequence in str[0..len).

   Walks the buffer one character at a time with mbrtowc() and returns the
   byte offset at which a conversion error or an incomplete character is
   reported.  Returns 0 when no such position is found, and always returns
   0 when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3715
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003716PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003718 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719{
3720 wchar_t smallbuf[256];
3721 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3722 wchar_t *wstr;
3723 size_t wlen, wlen2;
3724 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003725 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003726 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003727 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003728 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003729
3730 if (locale_error_handler(errors, &surrogateescape) < 0)
3731 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003733 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3734 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003735 return NULL;
3736 }
3737
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003738 if (surrogateescape) {
3739 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003740 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003741 if (wstr == NULL) {
3742 if (wlen == (size_t)-1)
3743 PyErr_NoMemory();
3744 else
3745 PyErr_SetFromErrno(PyExc_OSError);
3746 return NULL;
3747 }
3748
3749 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003750 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003751 }
3752 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003753 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003754#ifndef HAVE_BROKEN_MBSTOWCS
3755 wlen = mbstowcs(NULL, str, 0);
3756#else
3757 wlen = len;
3758#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003759 if (wlen == (size_t)-1)
3760 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003761 if (wlen+1 <= smallbuf_len) {
3762 wstr = smallbuf;
3763 }
3764 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003765 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003766 if (!wstr)
3767 return PyErr_NoMemory();
3768 }
3769
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003770 wlen2 = mbstowcs(wstr, str, wlen+1);
3771 if (wlen2 == (size_t)-1) {
3772 if (wstr != smallbuf)
3773 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003774 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003775 }
3776#ifdef HAVE_BROKEN_MBSTOWCS
3777 assert(wlen2 == wlen);
3778#endif
3779 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3780 if (wstr != smallbuf)
3781 PyMem_Free(wstr);
3782 }
3783 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003784
3785decode_error:
3786 errmsg = strerror(errno);
3787 assert(errmsg != NULL);
3788
3789 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003790 wstr = Py_DecodeLocale(errmsg, &errlen);
3791 if (wstr != NULL) {
3792 reason = PyUnicode_FromWideChar(wstr, errlen);
3793 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003794 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003795
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003796 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003797 reason = PyUnicode_FromString(
3798 "mbstowcs() encountered an invalid multibyte sequence");
3799 if (reason == NULL)
3800 return NULL;
3801
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3803 "locale", str, len,
3804 (Py_ssize_t)error_pos,
3805 (Py_ssize_t)(error_pos+1),
3806 reason);
3807 Py_DECREF(reason);
3808 if (exc != NULL) {
3809 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003810 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003811 }
3812 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003813}
3814
3815PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003816PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003817{
3818 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003819 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003820}
3821
3822
3823PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003824PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003825 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003826 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3827}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003828
Christian Heimes5894ba72007-11-04 11:43:14 +00003829PyObject*
3830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3831{
Steve Dowercc16be82016-09-08 10:35:16 -07003832#if defined(__APPLE__)
3833 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003834#else
Victor Stinner793b5312011-04-27 00:24:21 +02003835 PyInterpreterState *interp = PyThreadState_GET()->interp;
3836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3837 cannot use it to encode and decode filenames before it is loaded. Load
3838 the Python codec requires to encode at least its own filename. Use the C
3839 version of the locale codec until the codec registry is initialized and
3840 the Python codec is loaded.
3841
3842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3843 cannot only rely on it: check also interp->fscodec_initialized for
3844 subinterpreters. */
3845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003846 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003848 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003849 }
3850 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003851 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003852 }
Victor Stinnerad158722010-10-27 00:25:46 +00003853#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003854}
3855
Martin v. Löwis011e8422009-05-05 04:43:17 +00003856
3857int
3858PyUnicode_FSConverter(PyObject* arg, void* addr)
3859{
Brett Cannonec6ce872016-09-06 15:50:29 -07003860 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003861 PyObject *output = NULL;
3862 Py_ssize_t size;
3863 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003864 if (arg == NULL) {
3865 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003866 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003867 return 1;
3868 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003869 path = PyOS_FSPath(arg);
3870 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003871 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003872 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003873 if (PyBytes_Check(path)) {
3874 output = path;
3875 }
3876 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3877 output = PyUnicode_EncodeFSDefault(path);
3878 Py_DECREF(path);
3879 if (!output) {
3880 return 0;
3881 }
3882 assert(PyBytes_Check(output));
3883 }
3884
Victor Stinner0ea2a462010-04-30 00:22:08 +00003885 size = PyBytes_GET_SIZE(output);
3886 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003887 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003888 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003889 Py_DECREF(output);
3890 return 0;
3891 }
3892 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003893 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003894}
3895
3896
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003897int
3898PyUnicode_FSDecoder(PyObject* arg, void* addr)
3899{
Brett Cannona5711202016-09-06 19:36:01 -07003900 int is_buffer = 0;
3901 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003902 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003903 if (arg == NULL) {
3904 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003905 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003906 return 1;
3907 }
Brett Cannona5711202016-09-06 19:36:01 -07003908
3909 is_buffer = PyObject_CheckBuffer(arg);
3910 if (!is_buffer) {
3911 path = PyOS_FSPath(arg);
3912 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003913 return 0;
3914 }
Brett Cannona5711202016-09-06 19:36:01 -07003915 }
3916 else {
3917 path = arg;
3918 Py_INCREF(arg);
3919 }
3920
3921 if (PyUnicode_Check(path)) {
3922 if (PyUnicode_READY(path) == -1) {
3923 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003924 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003925 }
3926 output = path;
3927 }
3928 else if (PyBytes_Check(path) || is_buffer) {
3929 PyObject *path_bytes = NULL;
3930
3931 if (!PyBytes_Check(path) &&
3932 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3933 "path should be string, bytes, or os.PathLike, not %.200s",
3934 Py_TYPE(arg)->tp_name)) {
3935 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003936 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003937 }
3938 path_bytes = PyBytes_FromObject(path);
3939 Py_DECREF(path);
3940 if (!path_bytes) {
3941 return 0;
3942 }
3943 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3944 PyBytes_GET_SIZE(path_bytes));
3945 Py_DECREF(path_bytes);
3946 if (!output) {
3947 return 0;
3948 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003949 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003950 else {
3951 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003952 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003953 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003954 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003955 return 0;
3956 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003957 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003958 Py_DECREF(output);
3959 return 0;
3960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003962 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003963 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003964 Py_DECREF(output);
3965 return 0;
3966 }
3967 *(PyObject**)addr = output;
3968 return Py_CLEANUP_SUPPORTED;
3969}
3970
3971
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003972const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003974{
Christian Heimesf3863112007-11-22 07:46:41 +00003975 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003977 if (!PyUnicode_Check(unicode)) {
3978 PyErr_BadArgument();
3979 return NULL;
3980 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003981 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003982 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003984 if (PyUnicode_UTF8(unicode) == NULL) {
3985 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003986 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 if (bytes == NULL)
3988 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003989 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3990 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003991 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 Py_DECREF(bytes);
3993 return NULL;
3994 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003995 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003996 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 PyBytes_AS_STRING(bytes),
3998 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 Py_DECREF(bytes);
4000 }
4001
4002 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004003 *psize = PyUnicode_UTF8_LENGTH(unicode);
4004 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004005}
4006
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004007const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4011}
4012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013Py_UNICODE *
4014PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 const unsigned char *one_byte;
4017#if SIZEOF_WCHAR_T == 4
4018 const Py_UCS2 *two_bytes;
4019#else
4020 const Py_UCS4 *four_bytes;
4021 const Py_UCS4 *ucs4_end;
4022 Py_ssize_t num_surrogates;
4023#endif
4024 wchar_t *w;
4025 wchar_t *wchar_end;
4026
4027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 return NULL;
4030 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004031 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004033 assert(_PyUnicode_KIND(unicode) != 0);
4034 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004036 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004038 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4039 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 num_surrogates = 0;
4041
4042 for (; four_bytes < ucs4_end; ++four_bytes) {
4043 if (*four_bytes > 0xFFFF)
4044 ++num_surrogates;
4045 }
4046
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004047 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4048 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4049 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 PyErr_NoMemory();
4051 return NULL;
4052 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004053 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004055 w = _PyUnicode_WSTR(unicode);
4056 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4057 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4059 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004060 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004062 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4063 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 }
4065 else
4066 *w = *four_bytes;
4067
4068 if (w > wchar_end) {
4069 assert(0 && "Miscalculated string end");
4070 }
4071 }
4072 *w = 0;
4073#else
4074 /* sizeof(wchar_t) == 4 */
4075 Py_FatalError("Impossible unicode object state, wstr and str "
4076 "should share memory already.");
4077 return NULL;
4078#endif
4079 }
4080 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004081 if ((size_t)_PyUnicode_LENGTH(unicode) >
4082 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4083 PyErr_NoMemory();
4084 return NULL;
4085 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004086 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4087 (_PyUnicode_LENGTH(unicode) + 1));
4088 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 PyErr_NoMemory();
4090 return NULL;
4091 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004092 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4093 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4094 w = _PyUnicode_WSTR(unicode);
4095 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004097 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4098 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099 for (; w < wchar_end; ++one_byte, ++w)
4100 *w = *one_byte;
4101 /* null-terminate the wstr */
4102 *w = 0;
4103 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004104 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004106 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107 for (; w < wchar_end; ++two_bytes, ++w)
4108 *w = *two_bytes;
4109 /* null-terminate the wstr */
4110 *w = 0;
4111#else
4112 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004113 PyObject_FREE(_PyUnicode_WSTR(unicode));
4114 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 Py_FatalError("Impossible unicode object state, wstr "
4116 "and str should share memory already.");
4117 return NULL;
4118#endif
4119 }
4120 else {
4121 assert(0 && "This should never happen.");
4122 }
4123 }
4124 }
4125 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004126 *size = PyUnicode_WSTR_LENGTH(unicode);
4127 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004128}
4129
Alexander Belopolsky40018472011-02-26 01:02:56 +00004130Py_UNICODE *
4131PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134}
4135
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004136const Py_UNICODE *
4137_PyUnicode_AsUnicode(PyObject *unicode)
4138{
4139 Py_ssize_t size;
4140 const Py_UNICODE *wstr;
4141
4142 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4143 if (wstr && wcslen(wstr) != (size_t)size) {
4144 PyErr_SetString(PyExc_ValueError, "embedded null character");
4145 return NULL;
4146 }
4147 return wstr;
4148}
4149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004150
Alexander Belopolsky40018472011-02-26 01:02:56 +00004151Py_ssize_t
4152PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153{
4154 if (!PyUnicode_Check(unicode)) {
4155 PyErr_BadArgument();
4156 goto onError;
4157 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004158 if (_PyUnicode_WSTR(unicode) == NULL) {
4159 if (PyUnicode_AsUnicode(unicode) == NULL)
4160 goto onError;
4161 }
4162 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 return -1;
4166}
4167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004168Py_ssize_t
4169PyUnicode_GetLength(PyObject *unicode)
4170{
Victor Stinner07621332012-06-16 04:53:46 +02004171 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172 PyErr_BadArgument();
4173 return -1;
4174 }
Victor Stinner07621332012-06-16 04:53:46 +02004175 if (PyUnicode_READY(unicode) == -1)
4176 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004177 return PyUnicode_GET_LENGTH(unicode);
4178}
4179
4180Py_UCS4
4181PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4182{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004183 void *data;
4184 int kind;
4185
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004186 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4187 PyErr_BadArgument();
4188 return (Py_UCS4)-1;
4189 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004190 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004191 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192 return (Py_UCS4)-1;
4193 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004194 data = PyUnicode_DATA(unicode);
4195 kind = PyUnicode_KIND(unicode);
4196 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197}
4198
4199int
4200PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4201{
4202 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004203 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004204 return -1;
4205 }
Victor Stinner488fa492011-12-12 00:01:39 +01004206 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004207 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004208 PyErr_SetString(PyExc_IndexError, "string index out of range");
4209 return -1;
4210 }
Victor Stinner488fa492011-12-12 00:01:39 +01004211 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004212 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004213 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4214 PyErr_SetString(PyExc_ValueError, "character out of range");
4215 return -1;
4216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004217 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4218 index, ch);
4219 return 0;
4220}
4221
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *name = "utf-8";
    return name;
}
4227
Victor Stinner554f3f02010-06-16 23:33:54 +00004228/* create or adjust a UnicodeDecodeError */
4229static void
4230make_decode_exception(PyObject **exceptionObject,
4231 const char *encoding,
4232 const char *input, Py_ssize_t length,
4233 Py_ssize_t startpos, Py_ssize_t endpos,
4234 const char *reason)
4235{
4236 if (*exceptionObject == NULL) {
4237 *exceptionObject = PyUnicodeDecodeError_Create(
4238 encoding, input, length, startpos, endpos, reason);
4239 }
4240 else {
4241 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4242 goto onError;
4243 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4244 goto onError;
4245 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4246 goto onError;
4247 }
4248 return;
4249
4250onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004251 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004252}
4253
#ifdef MS_WINDOWS
/* error handling callback helper (wchar_t output buffer variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors / errorHandler: name of the error handler and a cache slot for the
       looked-up callable (filled in on first use).
   encoding / reason: used to create or update *exceptionObject.
   input / inend: in-out; reloaded from the exception's object after the
       callback ran, because the callback may have replaced it.
   startinpos / endinpos: position of the undecodable range; *endinpos is
       set to the position returned by the callback.
   inptr: out; set to the input position where decoding should resume.
   output / outpos: the wchar_t-kind result string and the current write
       position; the string is resized when the replacement does not fit.

   return 0 on success, -1 on error (with an exception set)
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after the ';' doubles as the TypeError message used when the
       handler does not return a tuple (hence &argparse[3] below). */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is enough and cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen code units.  wcsncpy() is not suitable here: it
       stops at the first L'\0' in the source and zero-fills the remainder,
       so a replacement string containing an embedded NUL character would
       lose everything after it. */
    wmemcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004366
4367static int
4368unicode_decode_call_errorhandler_writer(
4369 const char *errors, PyObject **errorHandler,
4370 const char *encoding, const char *reason,
4371 const char **input, const char **inend, Py_ssize_t *startinpos,
4372 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4373 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4374{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004375 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004376
4377 PyObject *restuple = NULL;
4378 PyObject *repunicode = NULL;
4379 Py_ssize_t insize;
4380 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004381 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004382 PyObject *inputobj = NULL;
4383
4384 if (*errorHandler == NULL) {
4385 *errorHandler = PyCodec_LookupError(errors);
4386 if (*errorHandler == NULL)
4387 goto onError;
4388 }
4389
4390 make_decode_exception(exceptionObject,
4391 encoding,
4392 *input, *inend - *input,
4393 *startinpos, *endinpos,
4394 reason);
4395 if (*exceptionObject == NULL)
4396 goto onError;
4397
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004398 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004399 if (restuple == NULL)
4400 goto onError;
4401 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004402 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004403 goto onError;
4404 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004405 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004406 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004407
4408 /* Copy back the bytes variables, which might have been modified by the
4409 callback */
4410 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4411 if (!inputobj)
4412 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004413 *input = PyBytes_AS_STRING(inputobj);
4414 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004415 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004416 /* we can DECREF safely, as the exception has another reference,
4417 so the object won't go away. */
4418 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004421 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004422 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004423 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004424 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004425 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426
Victor Stinner170ca6f2013-04-18 00:25:28 +02004427 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004428 if (replen > 1) {
4429 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004430 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004431 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4432 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4433 goto onError;
4434 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004435 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004436 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004439 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004440
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004442 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004447 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448}
4449
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450/* --- UTF-7 Codec -------------------------------------------------------- */
4451
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452/* See RFC2152 for details. We encode conservatively and decode liberally. */
4453
/* Base-64 helper macros for the UTF-7 codec.  Each macro may evaluate its
   argument more than once, so pass only side-effect-free expressions. */

/* Does c belong to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z'))

/* Map a base-64 character to its 6-bit value (0..63).  The result is only
   meaningful when IS_BASE64(c) holds: any character that is not a letter,
   a digit or '+' yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: should this byte of a UTF-7 string be decoded as
 * itself?  Decoding is permissive: every ASCII byte stands for itself
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
4484
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the character; the range test comes first, so the table is
 *            only indexed for 0 < c < 128.  c may be evaluated several
 *            times and must be free of side effects.
 * directO  : non-zero if Set O characters are written as themselves.
 * directWS : non-zero if whitespace characters are written as themselves.
 *
 * Every argument is parenthesized in the expansion so that expressions
 * containing low-precedence operators (e.g. `a || b`) are handled
 * correctly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530
Alexander Belopolsky40018472011-02-26 01:02:56 +00004531PyObject *
4532PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004533 Py_ssize_t size,
4534 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004536 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4537}
4538
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539/* The decoder. The only state we preserve is our read position,
4540 * i.e. how many characters we have consumed. So if we end in the
4541 * middle of a shift sequence we have to back off the read position
4542 * and the output to the beginning of the sequence, otherwise we lose
4543 * all the shift state (seen bits, number of bits seen, high
4544 * surrogate). */
4545
Alexander Belopolsky40018472011-02-26 01:02:56 +00004546PyObject *
4547PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004548 Py_ssize_t size,
4549 const char *errors,
4550 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004551{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004553 Py_ssize_t startinpos;
4554 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004556 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557 const char *errmsg = "";
4558 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004559 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 unsigned int base64bits = 0;
4561 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004562 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 PyObject *errorHandler = NULL;
4564 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004565
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004566 if (size == 0) {
4567 if (consumed)
4568 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004569 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004570 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004572 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004573 _PyUnicodeWriter_Init(&writer);
4574 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004575
4576 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004577 e = s + size;
4578
4579 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004580 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004582 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 if (inShift) { /* in a base-64 section */
4585 if (IS_BASE64(ch)) { /* consume a base-64 character */
4586 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4587 base64bits += 6;
4588 s++;
4589 if (base64bits >= 16) {
4590 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004591 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 base64bits -= 16;
4593 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004594 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 if (surrogate) {
4596 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004597 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4598 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004599 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004600 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004602 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 }
4604 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004605 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004606 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 }
4609 }
Victor Stinner551ac952011-11-29 22:58:13 +01004610 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 /* first surrogate */
4612 surrogate = outCh;
4613 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004615 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004616 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617 }
4618 }
4619 }
4620 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 if (base64bits > 0) { /* left-over bits */
4623 if (base64bits >= 6) {
4624 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004625 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004626 errmsg = "partial character in shift sequence";
4627 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004628 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 else {
4630 /* Some bits remain; they should be zero */
4631 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004632 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 errmsg = "non-zero padding bits in shift sequence";
4634 goto utf7Error;
4635 }
4636 }
4637 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004638 if (surrogate && DECODE_DIRECT(ch)) {
4639 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4640 goto onError;
4641 }
4642 surrogate = 0;
4643 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644 /* '-' is absorbed; other terminating
4645 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004646 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648 }
4649 }
4650 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 s++; /* consume '+' */
4653 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004655 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004656 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 }
4658 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004660 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004661 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004663 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004664 }
4665 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004666 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004667 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004668 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004669 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004670 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 else {
4672 startinpos = s-starts;
4673 s++;
4674 errmsg = "unexpected special character";
4675 goto utf7Error;
4676 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 errors, &errorHandler,
4682 "utf7", errmsg,
4683 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004684 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004686 }
4687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 /* end of string */
4689
4690 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4691 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004692 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 if (surrogate ||
4694 (base64bits >= 6) ||
4695 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004696 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004697 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 errors, &errorHandler,
4699 "utf7", "unterminated shift sequence",
4700 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004701 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 goto onError;
4703 if (s < e)
4704 goto restart;
4705 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004706 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707
4708 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004709 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004710 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004711 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004712 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004713 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004714 writer.kind, writer.data, shiftOutStart);
4715 Py_XDECREF(errorHandler);
4716 Py_XDECREF(exc);
4717 _PyUnicodeWriter_Dealloc(&writer);
4718 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004719 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004720 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004721 }
4722 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004723 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004725 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 Py_XDECREF(errorHandler);
4728 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004729 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004730
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 Py_XDECREF(errorHandler);
4733 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004734 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735 return NULL;
4736}
4737
4738
Alexander Belopolsky40018472011-02-26 01:02:56 +00004739PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004740_PyUnicode_EncodeUTF7(PyObject *str,
4741 int base64SetO,
4742 int base64WhiteSpace,
4743 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004744{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004745 int kind;
4746 void *data;
4747 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004748 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004749 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004750 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004751 unsigned int base64bits = 0;
4752 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753 char * out;
4754 char * start;
4755
Benjamin Petersonbac79492012-01-14 13:34:47 -05004756 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004757 return NULL;
4758 kind = PyUnicode_KIND(str);
4759 data = PyUnicode_DATA(str);
4760 len = PyUnicode_GET_LENGTH(str);
4761
4762 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004763 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004765 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004766 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004767 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004768 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004769 if (v == NULL)
4770 return NULL;
4771
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004772 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004773 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004774 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004775
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 if (inShift) {
4777 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4778 /* shifting out */
4779 if (base64bits) { /* output remaining bits */
4780 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4781 base64buffer = 0;
4782 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004783 }
4784 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004785 /* Characters not in the BASE64 set implicitly unshift the sequence
4786 so no '-' is required, except if the character is itself a '-' */
4787 if (IS_BASE64(ch) || ch == '-') {
4788 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004789 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 *out++ = (char) ch;
4791 }
4792 else {
4793 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004794 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004795 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004796 else { /* not in a shift sequence */
4797 if (ch == '+') {
4798 *out++ = '+';
4799 *out++ = '-';
4800 }
4801 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4802 *out++ = (char) ch;
4803 }
4804 else {
4805 *out++ = '+';
4806 inShift = 1;
4807 goto encode_char;
4808 }
4809 }
4810 continue;
4811encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004812 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004813 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004814
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 /* code first surrogate */
4816 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004817 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004818 while (base64bits >= 6) {
4819 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4820 base64bits -= 6;
4821 }
4822 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004823 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004824 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004825 base64bits += 16;
4826 base64buffer = (base64buffer << 16) | ch;
4827 while (base64bits >= 6) {
4828 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4829 base64bits -= 6;
4830 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004831 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832 if (base64bits)
4833 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4834 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004835 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004836 if (_PyBytes_Resize(&v, out - start) < 0)
4837 return NULL;
4838 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004839}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004840PyObject *
4841PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4842 Py_ssize_t size,
4843 int base64SetO,
4844 int base64WhiteSpace,
4845 const char *errors)
4846{
4847 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004848 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004849 if (tmp == NULL)
4850 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004851 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004852 base64WhiteSpace, errors);
4853 Py_DECREF(tmp);
4854 return result;
4855}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004856
Antoine Pitrou244651a2009-05-04 18:56:13 +00004857#undef IS_BASE64
4858#undef FROM_BASE64
4859#undef TO_BASE64
4860#undef DECODE_DIRECT
4861#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004862
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863/* --- UTF-8 Codec -------------------------------------------------------- */
4864
Alexander Belopolsky40018472011-02-26 01:02:56 +00004865PyObject *
4866PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004867 Py_ssize_t size,
4868 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869{
Walter Dörwald69652032004-09-07 20:24:22 +00004870 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4871}
4872
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004873#include "stringlib/asciilib.h"
4874#include "stringlib/codecs.h"
4875#include "stringlib/undef.h"
4876
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004877#include "stringlib/ucs1lib.h"
4878#include "stringlib/codecs.h"
4879#include "stringlib/undef.h"
4880
4881#include "stringlib/ucs2lib.h"
4882#include "stringlib/codecs.h"
4883#include "stringlib/undef.h"
4884
4885#include "stringlib/ucs4lib.h"
4886#include "stringlib/codecs.h"
4887#include "stringlib/undef.h"
4888
Antoine Pitrouab868312009-01-10 15:40:25 +00004889/* Mask to quickly check whether a C 'long' contains a
4890 non-ASCII, UTF8-encoded char. */
4891#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004892# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004893#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004894# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004895#else
4896# error C 'long' size should be either 4 or 8!
4897#endif
4898
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899static Py_ssize_t
4900ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004901{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004903 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004904
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004905 /*
4906 * Issue #17237: m68k is a bit different from most architectures in
4907 * that objects do not use "natural alignment" - for example, int and
4908 * long are only aligned at 2-byte boundaries. Therefore the assert()
4909 * won't work; also, tests have shown that skipping the "optimised
4910 * version" will even speed up m68k.
4911 */
4912#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004914 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4915 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 /* Fast path, see in STRINGLIB(utf8_decode) for
4917 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004918 /* Help allocation */
4919 const char *_p = p;
4920 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 while (_p < aligned_end) {
4922 unsigned long value = *(const unsigned long *) _p;
4923 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 *((unsigned long *)q) = value;
4926 _p += SIZEOF_LONG;
4927 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004928 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004929 p = _p;
4930 while (p < end) {
4931 if ((unsigned char)*p & 0x80)
4932 break;
4933 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004938#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939 while (p < end) {
4940 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4941 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004942 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004943 /* Help allocation */
4944 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 while (_p < aligned_end) {
4946 unsigned long value = *(unsigned long *) _p;
4947 if (value & ASCII_CHAR_MASK)
4948 break;
4949 _p += SIZEOF_LONG;
4950 }
4951 p = _p;
4952 if (_p == end)
4953 break;
4954 }
4955 if ((unsigned char)*p & 0x80)
4956 break;
4957 ++p;
4958 }
4959 memcpy(dest, start, p - start);
4960 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961}
Antoine Pitrouab868312009-01-10 15:40:25 +00004962
Victor Stinner785938e2011-12-11 20:09:03 +01004963PyObject *
4964PyUnicode_DecodeUTF8Stateful(const char *s,
4965 Py_ssize_t size,
4966 const char *errors,
4967 Py_ssize_t *consumed)
4968{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004970 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972
4973 Py_ssize_t startinpos;
4974 Py_ssize_t endinpos;
4975 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004976 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004978 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004979
4980 if (size == 0) {
4981 if (consumed)
4982 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004983 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004984 }
4985
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4987 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004988 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 *consumed = 1;
4990 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004991 }
4992
Victor Stinner8f674cc2013-04-17 23:02:17 +02004993 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004994 writer.min_length = size;
4995 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004997
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004998 writer.pos = ascii_decode(s, end, writer.data);
4999 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 while (s < end) {
5001 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005003
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005005 if (PyUnicode_IS_ASCII(writer.buffer))
5006 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005008 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005010 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 } else {
5012 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005013 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005014 }
5015
5016 switch (ch) {
5017 case 0:
5018 if (s == end || consumed)
5019 goto End;
5020 errmsg = "unexpected end of data";
5021 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005022 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005023 break;
5024 case 1:
5025 errmsg = "invalid start byte";
5026 startinpos = s - starts;
5027 endinpos = startinpos + 1;
5028 break;
5029 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005030 case 3:
5031 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005032 errmsg = "invalid continuation byte";
5033 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005034 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005035 break;
5036 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005037 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005038 goto onError;
5039 continue;
5040 }
5041
Victor Stinner1d65d912015-10-05 13:43:50 +02005042 if (error_handler == _Py_ERROR_UNKNOWN)
5043 error_handler = get_error_handler(errors);
5044
5045 switch (error_handler) {
5046 case _Py_ERROR_IGNORE:
5047 s += (endinpos - startinpos);
5048 break;
5049
5050 case _Py_ERROR_REPLACE:
5051 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5052 goto onError;
5053 s += (endinpos - startinpos);
5054 break;
5055
5056 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005057 {
5058 Py_ssize_t i;
5059
Victor Stinner1d65d912015-10-05 13:43:50 +02005060 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5061 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005062 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005063 ch = (Py_UCS4)(unsigned char)(starts[i]);
5064 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5065 ch + 0xdc00);
5066 writer.pos++;
5067 }
5068 s += (endinpos - startinpos);
5069 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005070 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005071
5072 default:
5073 if (unicode_decode_call_errorhandler_writer(
5074 errors, &error_handler_obj,
5075 "utf-8", errmsg,
5076 &starts, &end, &startinpos, &endinpos, &exc, &s,
5077 &writer))
5078 goto onError;
5079 }
Victor Stinner785938e2011-12-11 20:09:03 +01005080 }
5081
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 if (consumed)
5084 *consumed = s - starts;
5085
Victor Stinner1d65d912015-10-05 13:43:50 +02005086 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005088 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089
5090onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005091 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005093 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005094 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005095}
5096
Xavier de Gaye76febd02016-12-15 20:59:58 +01005097#if defined(__APPLE__) || defined(__ANDROID__)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005098
5099/* Simplified UTF-8 decoder using surrogateescape error handler,
Xavier de Gaye76febd02016-12-15 20:59:58 +01005100 used to decode the command line arguments on Mac OS X and Android.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005101
5102 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005103 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005104
5105wchar_t*
5106_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5107{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005108 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109 wchar_t *unicode;
5110 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005111
5112 /* Note: size will always be longer than the resulting Unicode
5113 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005114 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005115 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005116 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005117 if (!unicode)
5118 return NULL;
5119
5120 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005121 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005122 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005123 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005125#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005126 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005127#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005128 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005129#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 if (ch > 0xFF) {
5131#if SIZEOF_WCHAR_T == 4
5132 assert(0);
5133#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005134 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135 /* compute and append the two surrogates: */
5136 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5137 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5138#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005139 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005140 else {
5141 if (!ch && s == e)
5142 break;
5143 /* surrogateescape */
5144 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5145 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148 return unicode;
5149}
5150
Xavier de Gaye76febd02016-12-15 20:59:58 +01005151#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153/* Primary internal function which creates utf8 encoded bytes objects.
5154
5155 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005156 and allocate exactly as much space needed at the end. Else allocate the
5157 maximum possible needed (4 result bytes per Unicode character), and return
5158 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005159*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005160PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005161_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Victor Stinner6099a032011-12-18 14:22:26 +01005163 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005164 void *data;
5165 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005167 if (!PyUnicode_Check(unicode)) {
5168 PyErr_BadArgument();
5169 return NULL;
5170 }
5171
5172 if (PyUnicode_READY(unicode) == -1)
5173 return NULL;
5174
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005175 if (PyUnicode_UTF8(unicode))
5176 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5177 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005178
5179 kind = PyUnicode_KIND(unicode);
5180 data = PyUnicode_DATA(unicode);
5181 size = PyUnicode_GET_LENGTH(unicode);
5182
Benjamin Petersonead6b532011-12-20 17:23:42 -06005183 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005184 default:
5185 assert(0);
5186 case PyUnicode_1BYTE_KIND:
5187 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5188 assert(!PyUnicode_IS_ASCII(unicode));
5189 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5190 case PyUnicode_2BYTE_KIND:
5191 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5192 case PyUnicode_4BYTE_KIND:
5193 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195}
5196
Alexander Belopolsky40018472011-02-26 01:02:56 +00005197PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005198PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5199 Py_ssize_t size,
5200 const char *errors)
5201{
5202 PyObject *v, *unicode;
5203
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005204 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005205 if (unicode == NULL)
5206 return NULL;
5207 v = _PyUnicode_AsUTF8String(unicode, errors);
5208 Py_DECREF(unicode);
5209 return v;
5210}
5211
5212PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005213PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005215 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216}
5217
/* --- UTF-32 Codec ------------------------------------------------------- */
5219
5220PyObject *
5221PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 Py_ssize_t size,
5223 const char *errors,
5224 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225{
5226 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5227}
5228
5229PyObject *
5230PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 Py_ssize_t size,
5232 const char *errors,
5233 int *byteorder,
5234 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235{
5236 const char *starts = s;
5237 Py_ssize_t startinpos;
5238 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005239 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005240 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005241 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005242 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005243 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244 PyObject *errorHandler = NULL;
5245 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005246
Walter Dörwald41980ca2007-08-16 21:55:45 +00005247 q = (unsigned char *)s;
5248 e = q + size;
5249
5250 if (byteorder)
5251 bo = *byteorder;
5252
5253 /* Check for BOM marks (U+FEFF) in the input and adjust current
5254 byte order setting accordingly. In native mode, the leading BOM
5255 mark is skipped, in all other modes, it is copied to the output
5256 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005257 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005258 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005259 if (bom == 0x0000FEFF) {
5260 bo = -1;
5261 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005263 else if (bom == 0xFFFE0000) {
5264 bo = 1;
5265 q += 4;
5266 }
5267 if (byteorder)
5268 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005269 }
5270
Victor Stinnere64322e2012-10-30 23:12:47 +01005271 if (q == e) {
5272 if (consumed)
5273 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005274 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005275 }
5276
Victor Stinnere64322e2012-10-30 23:12:47 +01005277#ifdef WORDS_BIGENDIAN
5278 le = bo < 0;
5279#else
5280 le = bo <= 0;
5281#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005282 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005283
Victor Stinner8f674cc2013-04-17 23:02:17 +02005284 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005285 writer.min_length = (e - q + 3) / 4;
5286 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005287 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005288
Victor Stinnere64322e2012-10-30 23:12:47 +01005289 while (1) {
5290 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005291 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005292
Victor Stinnere64322e2012-10-30 23:12:47 +01005293 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005294 enum PyUnicode_Kind kind = writer.kind;
5295 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005297 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 if (le) {
5299 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005300 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 if (ch > maxch)
5302 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005303 if (kind != PyUnicode_1BYTE_KIND &&
5304 Py_UNICODE_IS_SURROGATE(ch))
5305 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005307 q += 4;
5308 } while (q <= last);
5309 }
5310 else {
5311 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005312 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005313 if (ch > maxch)
5314 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 if (kind != PyUnicode_1BYTE_KIND &&
5316 Py_UNICODE_IS_SURROGATE(ch))
5317 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005319 q += 4;
5320 } while (q <= last);
5321 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005322 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005323 }
5324
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005326 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005327 startinpos = ((const char *)q) - starts;
5328 endinpos = startinpos + 4;
5329 }
5330 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005333 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 startinpos = ((const char *)q) - starts;
5336 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005338 else {
5339 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005340 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005341 goto onError;
5342 q += 4;
5343 continue;
5344 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005345 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005346 startinpos = ((const char *)q) - starts;
5347 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005349
5350 /* The remaining input chars are ignored if the callback
5351 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005352 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005356 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005358 }
5359
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005362
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363 Py_XDECREF(errorHandler);
5364 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005365 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005366
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005369 Py_XDECREF(errorHandler);
5370 Py_XDECREF(exc);
5371 return NULL;
5372}
5373
5374PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005375_PyUnicode_EncodeUTF32(PyObject *str,
5376 const char *errors,
5377 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005379 enum PyUnicode_Kind kind;
5380 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005381 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005382 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005383 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005384#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005386#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005387 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005388#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005389 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005390 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005391 PyObject *errorHandler = NULL;
5392 PyObject *exc = NULL;
5393 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005394
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005395 if (!PyUnicode_Check(str)) {
5396 PyErr_BadArgument();
5397 return NULL;
5398 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005399 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400 return NULL;
5401 kind = PyUnicode_KIND(str);
5402 data = PyUnicode_DATA(str);
5403 len = PyUnicode_GET_LENGTH(str);
5404
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005405 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005406 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005408 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005409 if (v == NULL)
5410 return NULL;
5411
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005412 /* output buffer is 4-bytes aligned */
5413 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005414 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005415 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005416 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005417 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005421 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 else
5425 encoding = "utf-32";
5426
5427 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5429 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005430 }
5431
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005432 pos = 0;
5433 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005434 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005435
5436 if (kind == PyUnicode_2BYTE_KIND) {
5437 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5438 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005439 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 else {
5441 assert(kind == PyUnicode_4BYTE_KIND);
5442 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5443 &out, native_ordering);
5444 }
5445 if (pos == len)
5446 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005447
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005448 rep = unicode_encode_call_errorhandler(
5449 errors, &errorHandler,
5450 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005451 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005452 if (!rep)
5453 goto error;
5454
5455 if (PyBytes_Check(rep)) {
5456 repsize = PyBytes_GET_SIZE(rep);
5457 if (repsize & 3) {
5458 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005459 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 "surrogates not allowed");
5461 goto error;
5462 }
5463 moreunits = repsize / 4;
5464 }
5465 else {
5466 assert(PyUnicode_Check(rep));
5467 if (PyUnicode_READY(rep) < 0)
5468 goto error;
5469 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5470 if (!PyUnicode_IS_ASCII(rep)) {
5471 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005472 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005473 "surrogates not allowed");
5474 goto error;
5475 }
5476 }
5477
5478 /* four bytes are reserved for each surrogate */
5479 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005480 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 Py_ssize_t morebytes = 4 * (moreunits - 1);
5482 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5483 /* integer overflow */
5484 PyErr_NoMemory();
5485 goto error;
5486 }
5487 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5488 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005489 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005490 }
5491
5492 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005493 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005494 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005497 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5498 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 }
5500
5501 Py_CLEAR(rep);
5502 }
5503
5504 /* Cut back to size actually needed. This is necessary for, for example,
5505 encoding of a string containing isolated surrogates and the 'ignore'
5506 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005507 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005508 if (nsize != PyBytes_GET_SIZE(v))
5509 _PyBytes_Resize(&v, nsize);
5510 Py_XDECREF(errorHandler);
5511 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005512 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005513 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005514 error:
5515 Py_XDECREF(rep);
5516 Py_XDECREF(errorHandler);
5517 Py_XDECREF(exc);
5518 Py_XDECREF(v);
5519 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005520}
5521
Alexander Belopolsky40018472011-02-26 01:02:56 +00005522PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005523PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5524 Py_ssize_t size,
5525 const char *errors,
5526 int byteorder)
5527{
5528 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005529 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005530 if (tmp == NULL)
5531 return NULL;
5532 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5533 Py_DECREF(tmp);
5534 return result;
5535}
5536
5537PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005538PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005539{
Victor Stinnerb960b342011-11-20 19:12:52 +01005540 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005541}
5542
/* --- UTF-16 Codec ------------------------------------------------------- */
5544
Tim Peters772747b2001-08-09 22:21:55 +00005545PyObject *
5546PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 Py_ssize_t size,
5548 const char *errors,
5549 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
Walter Dörwald69652032004-09-07 20:24:22 +00005551 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5552}
5553
5554PyObject *
5555PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 Py_ssize_t size,
5557 const char *errors,
5558 int *byteorder,
5559 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005560{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005562 Py_ssize_t startinpos;
5563 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005564 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005565 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005566 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005567 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005568 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569 PyObject *errorHandler = NULL;
5570 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005571 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572
Tim Peters772747b2001-08-09 22:21:55 +00005573 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575
5576 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005577 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005579 /* Check for BOM marks (U+FEFF) in the input and adjust current
5580 byte order setting accordingly. In native mode, the leading BOM
5581 mark is skipped, in all other modes, it is copied to the output
5582 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005583 if (bo == 0 && size >= 2) {
5584 const Py_UCS4 bom = (q[1] << 8) | q[0];
5585 if (bom == 0xFEFF) {
5586 q += 2;
5587 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005589 else if (bom == 0xFFFE) {
5590 q += 2;
5591 bo = 1;
5592 }
5593 if (byteorder)
5594 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596
Antoine Pitrou63065d72012-05-15 23:48:04 +02005597 if (q == e) {
5598 if (consumed)
5599 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005600 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005601 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005602
Christian Heimes743e0cd2012-10-17 23:52:17 +02005603#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005604 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005605 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005606#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005608 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005609#endif
Tim Peters772747b2001-08-09 22:21:55 +00005610
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 /* Note: size will always be longer than the resulting Unicode
5612 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005613 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005614 writer.min_length = (e - q + 1) / 2;
5615 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005616 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005617
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 while (1) {
5619 Py_UCS4 ch = 0;
5620 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005625 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 native_ordering);
5627 else
5628 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 native_ordering);
5631 } else if (kind == PyUnicode_2BYTE_KIND) {
5632 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 native_ordering);
5635 } else {
5636 assert(kind == PyUnicode_4BYTE_KIND);
5637 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005638 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005639 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005640 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005641 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 switch (ch)
5644 {
5645 case 0:
5646 /* remaining byte at the end? (size should be even) */
5647 if (q == e || consumed)
5648 goto End;
5649 errmsg = "truncated data";
5650 startinpos = ((const char *)q) - starts;
5651 endinpos = ((const char *)e) - starts;
5652 break;
5653 /* The remaining input chars are ignored if the callback
5654 chooses to skip the input */
5655 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005656 q -= 2;
5657 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005658 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005659 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005660 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005661 endinpos = ((const char *)e) - starts;
5662 break;
5663 case 2:
5664 errmsg = "illegal encoding";
5665 startinpos = ((const char *)q) - 2 - starts;
5666 endinpos = startinpos + 2;
5667 break;
5668 case 3:
5669 errmsg = "illegal UTF-16 surrogate";
5670 startinpos = ((const char *)q) - 4 - starts;
5671 endinpos = startinpos + 2;
5672 break;
5673 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005674 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 continue;
5677 }
5678
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005679 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005680 errors,
5681 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005682 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005683 &starts,
5684 (const char **)&e,
5685 &startinpos,
5686 &endinpos,
5687 &exc,
5688 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
5692
Antoine Pitrou63065d72012-05-15 23:48:04 +02005693End:
Walter Dörwald69652032004-09-07 20:24:22 +00005694 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 Py_XDECREF(errorHandler);
5698 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005702 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 Py_XDECREF(errorHandler);
5704 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return NULL;
5706}
5707
Tim Peters772747b2001-08-09 22:21:55 +00005708PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005709_PyUnicode_EncodeUTF16(PyObject *str,
5710 const char *errors,
5711 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005713 enum PyUnicode_Kind kind;
5714 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005715 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005716 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005717 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005718 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005719#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005720 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005721#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005722 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005723#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 const char *encoding;
5725 Py_ssize_t nsize, pos;
5726 PyObject *errorHandler = NULL;
5727 PyObject *exc = NULL;
5728 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005729
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 if (!PyUnicode_Check(str)) {
5731 PyErr_BadArgument();
5732 return NULL;
5733 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005734 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 return NULL;
5736 kind = PyUnicode_KIND(str);
5737 data = PyUnicode_DATA(str);
5738 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005739
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005741 if (kind == PyUnicode_4BYTE_KIND) {
5742 const Py_UCS4 *in = (const Py_UCS4 *)data;
5743 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005744 while (in < end) {
5745 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005746 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005747 }
5748 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005750 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 nsize = len + pairs + (byteorder == 0);
5754 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005759 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005760 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005763 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 }
5765 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005766 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005767 }
Tim Peters772747b2001-08-09 22:21:55 +00005768
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 if (kind == PyUnicode_1BYTE_KIND) {
5770 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5771 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005772 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005773
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005776 }
5777 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005778 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005779 }
5780 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005782 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783
5784 pos = 0;
5785 while (pos < len) {
5786 Py_ssize_t repsize, moreunits;
5787
5788 if (kind == PyUnicode_2BYTE_KIND) {
5789 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5790 &out, native_ordering);
5791 }
5792 else {
5793 assert(kind == PyUnicode_4BYTE_KIND);
5794 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5795 &out, native_ordering);
5796 }
5797 if (pos == len)
5798 break;
5799
5800 rep = unicode_encode_call_errorhandler(
5801 errors, &errorHandler,
5802 encoding, "surrogates not allowed",
5803 str, &exc, pos, pos + 1, &pos);
5804 if (!rep)
5805 goto error;
5806
5807 if (PyBytes_Check(rep)) {
5808 repsize = PyBytes_GET_SIZE(rep);
5809 if (repsize & 1) {
5810 raise_encode_exception(&exc, encoding,
5811 str, pos - 1, pos,
5812 "surrogates not allowed");
5813 goto error;
5814 }
5815 moreunits = repsize / 2;
5816 }
5817 else {
5818 assert(PyUnicode_Check(rep));
5819 if (PyUnicode_READY(rep) < 0)
5820 goto error;
5821 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5822 if (!PyUnicode_IS_ASCII(rep)) {
5823 raise_encode_exception(&exc, encoding,
5824 str, pos - 1, pos,
5825 "surrogates not allowed");
5826 goto error;
5827 }
5828 }
5829
5830 /* two bytes are reserved for each surrogate */
5831 if (moreunits > 1) {
5832 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5833 Py_ssize_t morebytes = 2 * (moreunits - 1);
5834 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5835 /* integer overflow */
5836 PyErr_NoMemory();
5837 goto error;
5838 }
5839 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5840 goto error;
5841 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5842 }
5843
5844 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005845 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005846 out += moreunits;
5847 } else /* rep is unicode */ {
5848 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5849 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5850 &out, native_ordering);
5851 }
5852
5853 Py_CLEAR(rep);
5854 }
5855
5856 /* Cut back to size actually needed. This is necessary for, for example,
5857 encoding of a string containing isolated surrogates and the 'ignore' handler
5858 is used. */
5859 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5860 if (nsize != PyBytes_GET_SIZE(v))
5861 _PyBytes_Resize(&v, nsize);
5862 Py_XDECREF(errorHandler);
5863 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005864 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005865 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005866 error:
5867 Py_XDECREF(rep);
5868 Py_XDECREF(errorHandler);
5869 Py_XDECREF(exc);
5870 Py_XDECREF(v);
5871 return NULL;
5872#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873}
5874
Alexander Belopolsky40018472011-02-26 01:02:56 +00005875PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005876PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5877 Py_ssize_t size,
5878 const char *errors,
5879 int byteorder)
5880{
5881 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005882 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883 if (tmp == NULL)
5884 return NULL;
5885 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5886 Py_DECREF(tmp);
5887 return result;
5888}
5889
5890PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005891PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894}
5895
5896/* --- Unicode Escape Codec ----------------------------------------------- */
5897
Fredrik Lundh06d12682001-01-24 07:59:11 +00005898static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005899
Alexander Belopolsky40018472011-02-26 01:02:56 +00005900PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005901_PyUnicode_DecodeUnicodeEscape(const char *s,
5902 Py_ssize_t size,
5903 const char *errors,
5904 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005907 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 PyObject *errorHandler = NULL;
5910 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005911
Eric V. Smith42454af2016-10-31 09:22:08 -04005912 // so we can remember if we've seen an invalid escape char or not
5913 *first_invalid_escape = NULL;
5914
Victor Stinner62ec3312016-09-06 17:04:34 -07005915 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005916 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005917 }
5918 /* Escaped strings will always be longer than the resulting
5919 Unicode string, so we start with size here and then reduce the
5920 length after conversion to the true value.
5921 (but if the error callback returns a long replacement string
5922 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005923 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005924 writer.min_length = size;
5925 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5926 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005927 }
5928
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 end = s + size;
5930 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005931 unsigned char c = (unsigned char) *s++;
5932 Py_UCS4 ch;
5933 int count;
5934 Py_ssize_t startinpos;
5935 Py_ssize_t endinpos;
5936 const char *message;
5937
5938#define WRITE_ASCII_CHAR(ch) \
5939 do { \
5940 assert(ch <= 127); \
5941 assert(writer.pos < writer.size); \
5942 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5943 } while(0)
5944
5945#define WRITE_CHAR(ch) \
5946 do { \
5947 if (ch <= writer.maxchar) { \
5948 assert(writer.pos < writer.size); \
5949 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5950 } \
5951 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5952 goto onError; \
5953 } \
5954 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
5956 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005957 if (c != '\\') {
5958 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 continue;
5960 }
5961
Victor Stinner62ec3312016-09-06 17:04:34 -07005962 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005964 if (s >= end) {
5965 message = "\\ at end of string";
5966 goto error;
5967 }
5968 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005969
Victor Stinner62ec3312016-09-06 17:04:34 -07005970 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005971 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005974 case '\n': continue;
5975 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5976 case '\'': WRITE_ASCII_CHAR('\''); continue;
5977 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5978 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005979 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005980 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5981 case 't': WRITE_ASCII_CHAR('\t'); continue;
5982 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5983 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005984 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005985 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005987 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 case '0': case '1': case '2': case '3':
5991 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005992 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005993 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 ch = (ch<<3) + *s++ - '0';
5995 if (s < end && '0' <= *s && *s <= '7') {
5996 ch = (ch<<3) + *s++ - '0';
5997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005999 WRITE_CHAR(ch);
6000 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 /* hex escapes */
6003 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006005 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006006 message = "truncated \\xXX escape";
6007 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006012 message = "truncated \\uXXXX escape";
6013 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006016 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006017 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006018 message = "truncated \\UXXXXXXXX escape";
6019 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006020 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006021 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006022 ch <<= 4;
6023 if (c >= '0' && c <= '9') {
6024 ch += c - '0';
6025 }
6026 else if (c >= 'a' && c <= 'f') {
6027 ch += c - ('a' - 10);
6028 }
6029 else if (c >= 'A' && c <= 'F') {
6030 ch += c - ('A' - 10);
6031 }
6032 else {
6033 break;
6034 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006035 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006036 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006037 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006038 }
6039
6040 /* when we get here, ch is a 32-bit unicode character */
6041 if (ch > MAX_UNICODE) {
6042 message = "illegal Unicode character";
6043 goto error;
6044 }
6045
6046 WRITE_CHAR(ch);
6047 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006048
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006050 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006051 if (ucnhash_CAPI == NULL) {
6052 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006053 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6054 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006055 if (ucnhash_CAPI == NULL) {
6056 PyErr_SetString(
6057 PyExc_UnicodeError,
6058 "\\N escapes not supported (can't load unicodedata module)"
6059 );
6060 goto onError;
6061 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006063
6064 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006065 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006066 const char *start = ++s;
6067 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006068 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006069 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006070 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006071 namelen = s - start;
6072 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006074 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006075 ch = 0xffffffff; /* in case 'getcode' messes up */
6076 if (namelen <= INT_MAX &&
6077 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6078 &ch, 0)) {
6079 assert(ch <= MAX_UNICODE);
6080 WRITE_CHAR(ch);
6081 continue;
6082 }
6083 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006084 }
6085 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006086 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006087
6088 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006089 if (*first_invalid_escape == NULL) {
6090 *first_invalid_escape = s-1; /* Back up one char, since we've
6091 already incremented s. */
6092 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006093 WRITE_ASCII_CHAR('\\');
6094 WRITE_CHAR(c);
6095 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006097
6098 error:
6099 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006100 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006101 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006102 errors, &errorHandler,
6103 "unicodeescape", message,
6104 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006105 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006106 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006107 }
6108 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6109 goto onError;
6110 }
6111
6112#undef WRITE_ASCII_CHAR
6113#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006115
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006116 Py_XDECREF(errorHandler);
6117 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006118 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006119
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006121 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 Py_XDECREF(errorHandler);
6123 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 return NULL;
6125}
6126
Eric V. Smith42454af2016-10-31 09:22:08 -04006127PyObject *
6128PyUnicode_DecodeUnicodeEscape(const char *s,
6129 Py_ssize_t size,
6130 const char *errors)
6131{
6132 const char *first_invalid_escape;
6133 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6134 &first_invalid_escape);
6135 if (result == NULL)
6136 return NULL;
6137 if (first_invalid_escape != NULL) {
6138 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6139 "invalid escape sequence '\\%c'",
6140 *first_invalid_escape) < 0) {
6141 Py_DECREF(result);
6142 return NULL;
6143 }
6144 }
6145 return result;
6146}
6147
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006148/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
Alexander Belopolsky40018472011-02-26 01:02:56 +00006150PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006154 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006158 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
Ezio Melottie7f90372012-10-05 03:33:31 +03006160 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006161 escape.
6162
Ezio Melottie7f90372012-10-05 03:33:31 +03006163 For UCS1 strings it's '\xxx', 4 bytes per source character.
6164 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6165 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006166 */
6167
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168 if (!PyUnicode_Check(unicode)) {
6169 PyErr_BadArgument();
6170 return NULL;
6171 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006174 }
Victor Stinner358af132015-10-12 22:36:57 +02006175
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006176 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 if (len == 0) {
6178 return PyBytes_FromStringAndSize(NULL, 0);
6179 }
6180
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 kind = PyUnicode_KIND(unicode);
6182 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6184 bytes, and 1 byte characters 4. */
6185 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006186 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 return PyErr_NoMemory();
6188 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006189 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006190 if (repr == NULL) {
6191 return NULL;
6192 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006193
Victor Stinner62ec3312016-09-06 17:04:34 -07006194 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006196 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006197
Victor Stinner62ec3312016-09-06 17:04:34 -07006198 /* U+0000-U+00ff range */
6199 if (ch < 0x100) {
6200 if (ch >= ' ' && ch < 127) {
6201 if (ch != '\\') {
6202 /* Copy printable US ASCII as-is */
6203 *p++ = (char) ch;
6204 }
6205 /* Escape backslashes */
6206 else {
6207 *p++ = '\\';
6208 *p++ = '\\';
6209 }
6210 }
Victor Stinner358af132015-10-12 22:36:57 +02006211
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 /* Map special whitespace to '\t', \n', '\r' */
6213 else if (ch == '\t') {
6214 *p++ = '\\';
6215 *p++ = 't';
6216 }
6217 else if (ch == '\n') {
6218 *p++ = '\\';
6219 *p++ = 'n';
6220 }
6221 else if (ch == '\r') {
6222 *p++ = '\\';
6223 *p++ = 'r';
6224 }
6225
6226 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6227 else {
6228 *p++ = '\\';
6229 *p++ = 'x';
6230 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6231 *p++ = Py_hexdigits[ch & 0x000F];
6232 }
Tim Petersced69f82003-09-16 20:30:58 +00006233 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006234 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 *p++ = '\\';
6237 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006238 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6239 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6240 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6241 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006243 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6244 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006245
Victor Stinner62ec3312016-09-06 17:04:34 -07006246 /* Make sure that the first two digits are zero */
6247 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006248 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 *p++ = 'U';
6250 *p++ = '0';
6251 *p++ = '0';
6252 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6256 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6257 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 assert(p - PyBytes_AS_STRING(repr) > 0);
6262 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6263 return NULL;
6264 }
6265 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266}
6267
Alexander Belopolsky40018472011-02-26 01:02:56 +00006268PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006269PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6270 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006272 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006273 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006274 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006276 }
6277
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278 result = PyUnicode_AsUnicodeEscapeString(tmp);
6279 Py_DECREF(tmp);
6280 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281}
6282
6283/* --- Raw Unicode Escape Codec ------------------------------------------- */
6284
Alexander Belopolsky40018472011-02-26 01:02:56 +00006285PyObject *
6286PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006287 Py_ssize_t size,
6288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006290 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006291 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 PyObject *errorHandler = NULL;
6294 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006295
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006297 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 /* Escaped strings will always be longer than the resulting
6301 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006302 length after conversion to the true value. (But decoding error
6303 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006304 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 writer.min_length = size;
6306 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6307 goto onError;
6308 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006309
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 end = s + size;
6311 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006312 unsigned char c = (unsigned char) *s++;
6313 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006314 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 Py_ssize_t startinpos;
6316 Py_ssize_t endinpos;
6317 const char *message;
6318
6319#define WRITE_CHAR(ch) \
6320 do { \
6321 if (ch <= writer.maxchar) { \
6322 assert(writer.pos < writer.size); \
6323 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6324 } \
6325 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6326 goto onError; \
6327 } \
6328 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 if (c != '\\' || s >= end) {
6332 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006334 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006335
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 c = (unsigned char) *s++;
6337 if (c == 'u') {
6338 count = 4;
6339 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 else if (c == 'U') {
6342 count = 8;
6343 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006344 }
6345 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 assert(writer.pos < writer.size);
6347 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6348 WRITE_CHAR(c);
6349 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006350 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 startinpos = s - starts - 2;
6352
6353 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6354 for (ch = 0; count && s < end; ++s, --count) {
6355 c = (unsigned char)*s;
6356 ch <<= 4;
6357 if (c >= '0' && c <= '9') {
6358 ch += c - '0';
6359 }
6360 else if (c >= 'a' && c <= 'f') {
6361 ch += c - ('a' - 10);
6362 }
6363 else if (c >= 'A' && c <= 'F') {
6364 ch += c - ('A' - 10);
6365 }
6366 else {
6367 break;
6368 }
6369 }
6370 if (!count) {
6371 if (ch <= MAX_UNICODE) {
6372 WRITE_CHAR(ch);
6373 continue;
6374 }
6375 message = "\\Uxxxxxxxx out of range";
6376 }
6377
6378 endinpos = s-starts;
6379 writer.min_length = end - s + writer.pos;
6380 if (unicode_decode_call_errorhandler_writer(
6381 errors, &errorHandler,
6382 "rawunicodeescape", message,
6383 &starts, &end, &startinpos, &endinpos, &exc, &s,
6384 &writer)) {
6385 goto onError;
6386 }
6387 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6388 goto onError;
6389 }
6390
6391#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 Py_XDECREF(errorHandler);
6394 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006395 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006396
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006398 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006402
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403}
6404
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006405
Alexander Belopolsky40018472011-02-26 01:02:56 +00006406PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408{
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412 int kind;
6413 void *data;
6414 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416 if (!PyUnicode_Check(unicode)) {
6417 PyErr_BadArgument();
6418 return NULL;
6419 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006422 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006423 kind = PyUnicode_KIND(unicode);
6424 data = PyUnicode_DATA(unicode);
6425 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 if (kind == PyUnicode_1BYTE_KIND) {
6427 return PyBytes_FromStringAndSize(data, len);
6428 }
Victor Stinner0e368262011-11-10 20:12:49 +01006429
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6431 bytes, and 1 byte characters 4. */
6432 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006433
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 if (len > PY_SSIZE_T_MAX / expandsize) {
6435 return PyErr_NoMemory();
6436 }
6437 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6438 if (repr == NULL) {
6439 return NULL;
6440 }
6441 if (len == 0) {
6442 return repr;
6443 }
6444
6445 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006446 for (pos = 0; pos < len; pos++) {
6447 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006448
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6450 if (ch < 0x100) {
6451 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006452 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6454 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 *p++ = '\\';
6456 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006457 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6458 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6459 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6460 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006462 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6463 else {
6464 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6465 *p++ = '\\';
6466 *p++ = 'U';
6467 *p++ = '0';
6468 *p++ = '0';
6469 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6474 *p++ = Py_hexdigits[ch & 15];
6475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006477
Victor Stinner62ec3312016-09-06 17:04:34 -07006478 assert(p > PyBytes_AS_STRING(repr));
6479 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6480 return NULL;
6481 }
6482 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483}
6484
Alexander Belopolsky40018472011-02-26 01:02:56 +00006485PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006486PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006490 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006492 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6494 Py_DECREF(tmp);
6495 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496}
6497
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006498/* --- Unicode Internal Codec ------------------------------------------- */
6499
Alexander Belopolsky40018472011-02-26 01:02:56 +00006500PyObject *
6501_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006502 Py_ssize_t size,
6503 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006504{
6505 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006506 Py_ssize_t startinpos;
6507 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006508 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006509 const char *end;
6510 const char *reason;
6511 PyObject *errorHandler = NULL;
6512 PyObject *exc = NULL;
6513
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006514 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006515 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006516 1))
6517 return NULL;
6518
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006519 if (size == 0)
6520 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006521
Victor Stinner8f674cc2013-04-17 23:02:17 +02006522 _PyUnicodeWriter_Init(&writer);
6523 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6524 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006526 }
6527 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006528
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006530 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006531 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006532 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006533 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006534 endinpos = end-starts;
6535 reason = "truncated input";
6536 goto error;
6537 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006538 /* We copy the raw representation one byte at a time because the
6539 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006540 ((char *) &uch)[0] = s[0];
6541 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006542#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006543 ((char *) &uch)[2] = s[2];
6544 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006545#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006546 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006547#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006548 /* We have to sanity check the raw data, otherwise doom looms for
6549 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006550 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006551 endinpos = s - starts + Py_UNICODE_SIZE;
6552 reason = "illegal code point (> 0x10FFFF)";
6553 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006554 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006555#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006556 s += Py_UNICODE_SIZE;
6557#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006558 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006559 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006560 Py_UNICODE uch2;
6561 ((char *) &uch2)[0] = s[0];
6562 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006563 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006564 {
Victor Stinner551ac952011-11-29 22:58:13 +01006565 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006566 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006567 }
6568 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006569#endif
6570
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006571 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006572 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006573 continue;
6574
6575 error:
6576 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006577 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006578 errors, &errorHandler,
6579 "unicode_internal", reason,
6580 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006581 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006582 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006583 }
6584
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006585 Py_XDECREF(errorHandler);
6586 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006587 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006588
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006590 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006591 Py_XDECREF(errorHandler);
6592 Py_XDECREF(exc);
6593 return NULL;
6594}
6595
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596/* --- Latin-1 Codec ------------------------------------------------------ */
6597
Alexander Belopolsky40018472011-02-26 01:02:56 +00006598PyObject *
6599PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006600 Py_ssize_t size,
6601 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006604 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605}
6606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006607/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006608static void
6609make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006610 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006611 PyObject *unicode,
6612 Py_ssize_t startpos, Py_ssize_t endpos,
6613 const char *reason)
6614{
6615 if (*exceptionObject == NULL) {
6616 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006617 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006618 encoding, unicode, startpos, endpos, reason);
6619 }
6620 else {
6621 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6622 goto onError;
6623 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6624 goto onError;
6625 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6626 goto onError;
6627 return;
6628 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006629 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006630 }
6631}
6632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006634static void
6635raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006636 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006637 PyObject *unicode,
6638 Py_ssize_t startpos, Py_ssize_t endpos,
6639 const char *reason)
6640{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006641 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006642 encoding, unicode, startpos, endpos, reason);
6643 if (*exceptionObject != NULL)
6644 PyCodec_StrictErrors(*exceptionObject);
6645}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646
6647/* error handling callback helper:
6648 build arguments, call the callback and check the arguments,
6649 put the result into newpos and return the replacement string, which
6650 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006651static PyObject *
6652unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006653 PyObject **errorHandler,
6654 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006655 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006656 Py_ssize_t startpos, Py_ssize_t endpos,
6657 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006659 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006660 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 PyObject *restuple;
6662 PyObject *resunicode;
6663
6664 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 }
6669
Benjamin Petersonbac79492012-01-14 13:34:47 -05006670 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006671 return NULL;
6672 len = PyUnicode_GET_LENGTH(unicode);
6673
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006674 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006675 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006678
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006679 restuple = PyObject_CallFunctionObjArgs(
6680 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006684 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 Py_DECREF(restuple);
6686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006688 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 &resunicode, newpos)) {
6690 Py_DECREF(restuple);
6691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006693 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6694 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6695 Py_DECREF(restuple);
6696 return NULL;
6697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006698 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006699 *newpos = len + *newpos;
6700 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006701 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 Py_DECREF(restuple);
6703 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 Py_INCREF(resunicode);
6706 Py_DECREF(restuple);
6707 return resunicode;
6708}
6709
Alexander Belopolsky40018472011-02-26 01:02:56 +00006710static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006711unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006712 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006713 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715 /* input state */
6716 Py_ssize_t pos=0, size;
6717 int kind;
6718 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719 /* pointer into the output */
6720 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006721 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6722 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006723 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006725 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006726 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006727 /* output object */
6728 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006729
Benjamin Petersonbac79492012-01-14 13:34:47 -05006730 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731 return NULL;
6732 size = PyUnicode_GET_LENGTH(unicode);
6733 kind = PyUnicode_KIND(unicode);
6734 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735 /* allocate enough for a simple encoding without
6736 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006737 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006738 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006739
6740 _PyBytesWriter_Init(&writer);
6741 str = _PyBytesWriter_Alloc(&writer, size);
6742 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006743 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006744
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006745 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006746 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006749 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006751 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006753 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006755 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006757 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006758 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006760
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006761 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006763
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006764 /* Only overallocate the buffer if it's not the last write */
6765 writer.overallocate = (collend < size);
6766
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006768 if (error_handler == _Py_ERROR_UNKNOWN)
6769 error_handler = get_error_handler(errors);
6770
6771 switch (error_handler) {
6772 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006773 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006775
6776 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006777 memset(str, '?', collend - collstart);
6778 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006779 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006780 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 break;
Victor Stinner50149202015-09-22 00:26:54 +02006783
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006784 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006785 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006786 writer.min_size -= (collend - collstart);
6787 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006788 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006789 if (str == NULL)
6790 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006791 pos = collend;
6792 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006793
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006794 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006795 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006796 writer.min_size -= (collend - collstart);
6797 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006798 unicode, collstart, collend);
6799 if (str == NULL)
6800 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006801 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 break;
Victor Stinner50149202015-09-22 00:26:54 +02006803
Victor Stinnerc3713e92015-09-29 12:32:13 +02006804 case _Py_ERROR_SURROGATEESCAPE:
6805 for (i = collstart; i < collend; ++i) {
6806 ch = PyUnicode_READ(kind, data, i);
6807 if (ch < 0xdc80 || 0xdcff < ch) {
6808 /* Not a UTF-8b surrogate */
6809 break;
6810 }
6811 *str++ = (char)(ch - 0xdc00);
6812 ++pos;
6813 }
6814 if (i >= collend)
6815 break;
6816 collstart = pos;
6817 assert(collstart != collend);
6818 /* fallback to general error handling */
6819
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006821 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6822 encoding, reason, unicode, &exc,
6823 collstart, collend, &newpos);
6824 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006826
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006827 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006828 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006829
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006831 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006832 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006833 PyBytes_AS_STRING(rep),
6834 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006835 if (str == NULL)
6836 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006837 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006838 else {
6839 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006840
Victor Stinner6bd525b2015-10-09 13:10:05 +02006841 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006843
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006844 if (limit == 256 ?
6845 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6846 !PyUnicode_IS_ASCII(rep))
6847 {
6848 /* Not all characters are smaller than limit */
6849 raise_encode_exception(&exc, encoding, unicode,
6850 collstart, collend, reason);
6851 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006853 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6854 str = _PyBytesWriter_WriteBytes(&writer, str,
6855 PyUnicode_DATA(rep),
6856 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006858 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006859 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006860 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006861
6862 /* If overallocation was disabled, ensure that it was the last
6863 write. Otherwise, we missed an optimization */
6864 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006865 }
6866 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006867
Victor Stinner50149202015-09-22 00:26:54 +02006868 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006870 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006871
6872 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006873 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006874 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006875 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006876 Py_XDECREF(exc);
6877 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006878}
6879
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006880/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006881PyObject *
6882PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006883 Py_ssize_t size,
6884 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006886 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006887 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888 if (unicode == NULL)
6889 return NULL;
6890 result = unicode_encode_ucs1(unicode, errors, 256);
6891 Py_DECREF(unicode);
6892 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893}
6894
Alexander Belopolsky40018472011-02-26 01:02:56 +00006895PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006896_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897{
6898 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 PyErr_BadArgument();
6900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006902 if (PyUnicode_READY(unicode) == -1)
6903 return NULL;
6904 /* Fast path: if it is a one-byte string, construct
6905 bytes object directly. */
6906 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6907 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6908 PyUnicode_GET_LENGTH(unicode));
6909 /* Non-Latin-1 characters present. Defer to above function to
6910 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006911 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006912}
6913
6914PyObject*
6915PyUnicode_AsLatin1String(PyObject *unicode)
6916{
6917 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918}
6919
6920/* --- 7-bit ASCII Codec -------------------------------------------------- */
6921
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
6923PyUnicode_DecodeASCII(const char *s,
6924 Py_ssize_t size,
6925 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006927 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006928 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006929 int kind;
6930 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006931 Py_ssize_t startinpos;
6932 Py_ssize_t endinpos;
6933 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006935 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006937 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006938
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006940 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006941
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006943 if (size == 1 && (unsigned char)s[0] < 128)
6944 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006945
Victor Stinner8f674cc2013-04-17 23:02:17 +02006946 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006947 writer.min_length = size;
6948 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006949 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006951 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006952 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006953 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006954 writer.pos = outpos;
6955 if (writer.pos == size)
6956 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006957
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006958 s += writer.pos;
6959 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006961 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006963 PyUnicode_WRITE(kind, data, writer.pos, c);
6964 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006966 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006968
6969 /* byte outsize range 0x00..0x7f: call the error handler */
6970
6971 if (error_handler == _Py_ERROR_UNKNOWN)
6972 error_handler = get_error_handler(errors);
6973
6974 switch (error_handler)
6975 {
6976 case _Py_ERROR_REPLACE:
6977 case _Py_ERROR_SURROGATEESCAPE:
6978 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006979 but we may switch to UCS2 at the first write */
6980 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6981 goto onError;
6982 kind = writer.kind;
6983 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006984
6985 if (error_handler == _Py_ERROR_REPLACE)
6986 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6987 else
6988 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6989 writer.pos++;
6990 ++s;
6991 break;
6992
6993 case _Py_ERROR_IGNORE:
6994 ++s;
6995 break;
6996
6997 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 startinpos = s-starts;
6999 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007000 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007001 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 "ascii", "ordinal not in range(128)",
7003 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007006 kind = writer.kind;
7007 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007010 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007011 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007012 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007013
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007015 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007016 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007017 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 return NULL;
7019}
7020
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007021/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007022PyObject *
7023PyUnicode_EncodeASCII(const Py_UNICODE *p,
7024 Py_ssize_t size,
7025 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007027 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007028 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029 if (unicode == NULL)
7030 return NULL;
7031 result = unicode_encode_ucs1(unicode, errors, 128);
7032 Py_DECREF(unicode);
7033 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034}
7035
Alexander Belopolsky40018472011-02-26 01:02:56 +00007036PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007037_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038{
7039 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 PyErr_BadArgument();
7041 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007043 if (PyUnicode_READY(unicode) == -1)
7044 return NULL;
7045 /* Fast path: if it is an ASCII-only string, construct bytes object
7046 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007047 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007048 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7049 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007050 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007051}
7052
7053PyObject *
7054PyUnicode_AsASCIIString(PyObject *unicode)
7055{
7056 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057}
7058
Steve Dowercc16be82016-09-08 10:35:16 -07007059#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007060
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007061/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007062
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007063#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064#define NEED_RETRY
7065#endif
7066
Victor Stinner3a50e702011-10-18 21:21:00 +02007067#ifndef WC_ERR_INVALID_CHARS
7068# define WC_ERR_INVALID_CHARS 0x0080
7069#endif
7070
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007071static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007072code_page_name(UINT code_page, PyObject **obj)
7073{
7074 *obj = NULL;
7075 if (code_page == CP_ACP)
7076 return "mbcs";
7077 if (code_page == CP_UTF7)
7078 return "CP_UTF7";
7079 if (code_page == CP_UTF8)
7080 return "CP_UTF8";
7081
7082 *obj = PyBytes_FromFormat("cp%u", code_page);
7083 if (*obj == NULL)
7084 return NULL;
7085 return PyBytes_AS_STRING(*obj);
7086}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007087
Victor Stinner3a50e702011-10-18 21:21:00 +02007088static DWORD
7089decode_code_page_flags(UINT code_page)
7090{
7091 if (code_page == CP_UTF7) {
7092 /* The CP_UTF7 decoder only supports flags=0 */
7093 return 0;
7094 }
7095 else
7096 return MB_ERR_INVALID_CHARS;
7097}
7098
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 * Decode a byte string from a Windows code page into unicode object in strict
7101 * mode.
7102 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007103 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7104 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007106static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007107decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007108 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 const char *in,
7110 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111{
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007113 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115
7116 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 assert(insize > 0);
7118 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7119 if (outsize <= 0)
7120 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007121
7122 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007124 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007125 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 if (*v == NULL)
7127 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129 }
7130 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007132 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007133 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136 }
7137
7138 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7140 if (outsize <= 0)
7141 goto error;
7142 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007143
Victor Stinner3a50e702011-10-18 21:21:00 +02007144error:
7145 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7146 return -2;
7147 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007148 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149}
7150
Victor Stinner3a50e702011-10-18 21:21:00 +02007151/*
7152 * Decode a byte string from a code page into unicode object with an error
7153 * handler.
7154 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007155 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 * UnicodeDecodeError exception and returns -1 on error.
7157 */
7158static int
7159decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007160 PyObject **v,
7161 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007162 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007163{
7164 const char *startin = in;
7165 const char *endin = in + size;
7166 const DWORD flags = decode_code_page_flags(code_page);
7167 /* Ideally, we should get reason from FormatMessage. This is the Windows
7168 2000 English version of the message. */
7169 const char *reason = "No mapping for the Unicode character exists "
7170 "in the target code page.";
7171 /* each step cannot decode more than 1 character, but a character can be
7172 represented as a surrogate pair */
7173 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007174 int insize;
7175 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 PyObject *errorHandler = NULL;
7177 PyObject *exc = NULL;
7178 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007179 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 DWORD err;
7181 int ret = -1;
7182
7183 assert(size > 0);
7184
7185 encoding = code_page_name(code_page, &encoding_obj);
7186 if (encoding == NULL)
7187 return -1;
7188
Victor Stinner7d00cc12014-03-17 23:08:06 +01007189 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7191 UnicodeDecodeError. */
7192 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7193 if (exc != NULL) {
7194 PyCodec_StrictErrors(exc);
7195 Py_CLEAR(exc);
7196 }
7197 goto error;
7198 }
7199
7200 if (*v == NULL) {
7201 /* Create unicode object */
7202 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7203 PyErr_NoMemory();
7204 goto error;
7205 }
Victor Stinnerab595942011-12-17 04:59:06 +01007206 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007207 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 if (*v == NULL)
7209 goto error;
7210 startout = PyUnicode_AS_UNICODE(*v);
7211 }
7212 else {
7213 /* Extend unicode object */
7214 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7215 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7216 PyErr_NoMemory();
7217 goto error;
7218 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007219 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 goto error;
7221 startout = PyUnicode_AS_UNICODE(*v) + n;
7222 }
7223
7224 /* Decode the byte string character per character */
7225 out = startout;
7226 while (in < endin)
7227 {
7228 /* Decode a character */
7229 insize = 1;
7230 do
7231 {
7232 outsize = MultiByteToWideChar(code_page, flags,
7233 in, insize,
7234 buffer, Py_ARRAY_LENGTH(buffer));
7235 if (outsize > 0)
7236 break;
7237 err = GetLastError();
7238 if (err != ERROR_NO_UNICODE_TRANSLATION
7239 && err != ERROR_INSUFFICIENT_BUFFER)
7240 {
7241 PyErr_SetFromWindowsErr(0);
7242 goto error;
7243 }
7244 insize++;
7245 }
7246 /* 4=maximum length of a UTF-8 sequence */
7247 while (insize <= 4 && (in + insize) <= endin);
7248
7249 if (outsize <= 0) {
7250 Py_ssize_t startinpos, endinpos, outpos;
7251
Victor Stinner7d00cc12014-03-17 23:08:06 +01007252 /* last character in partial decode? */
7253 if (in + insize >= endin && !final)
7254 break;
7255
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 startinpos = in - startin;
7257 endinpos = startinpos + 1;
7258 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007259 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 errors, &errorHandler,
7261 encoding, reason,
7262 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007263 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 {
7265 goto error;
7266 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007267 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 }
7269 else {
7270 in += insize;
7271 memcpy(out, buffer, outsize * sizeof(wchar_t));
7272 out += outsize;
7273 }
7274 }
7275
7276 /* write a NUL character at the end */
7277 *out = 0;
7278
7279 /* Extend unicode object */
7280 outsize = out - startout;
7281 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007282 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007283 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007284 /* (in - startin) <= size and size is an int */
7285 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007286
7287error:
7288 Py_XDECREF(encoding_obj);
7289 Py_XDECREF(errorHandler);
7290 Py_XDECREF(exc);
7291 return ret;
7292}
7293
Victor Stinner3a50e702011-10-18 21:21:00 +02007294static PyObject *
7295decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007296 const char *s, Py_ssize_t size,
7297 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007298{
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 PyObject *v = NULL;
7300 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 if (code_page < 0) {
7303 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7304 return NULL;
7305 }
7306
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007307 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309
Victor Stinner76a31a62011-11-04 00:05:13 +01007310 do
7311 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007312#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007313 if (size > INT_MAX) {
7314 chunk_size = INT_MAX;
7315 final = 0;
7316 done = 0;
7317 }
7318 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007320 {
7321 chunk_size = (int)size;
7322 final = (consumed == NULL);
7323 done = 1;
7324 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 if (chunk_size == 0 && done) {
7327 if (v != NULL)
7328 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007329 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007330 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007331
Victor Stinner76a31a62011-11-04 00:05:13 +01007332 converted = decode_code_page_strict(code_page, &v,
7333 s, chunk_size);
7334 if (converted == -2)
7335 converted = decode_code_page_errors(code_page, &v,
7336 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007337 errors, final);
7338 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007339
7340 if (converted < 0) {
7341 Py_XDECREF(v);
7342 return NULL;
7343 }
7344
7345 if (consumed)
7346 *consumed += converted;
7347
7348 s += converted;
7349 size -= converted;
7350 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007351
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007352 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007353}
7354
Alexander Belopolsky40018472011-02-26 01:02:56 +00007355PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007356PyUnicode_DecodeCodePageStateful(int code_page,
7357 const char *s,
7358 Py_ssize_t size,
7359 const char *errors,
7360 Py_ssize_t *consumed)
7361{
7362 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7363}
7364
7365PyObject *
7366PyUnicode_DecodeMBCSStateful(const char *s,
7367 Py_ssize_t size,
7368 const char *errors,
7369 Py_ssize_t *consumed)
7370{
7371 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7372}
7373
7374PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007375PyUnicode_DecodeMBCS(const char *s,
7376 Py_ssize_t size,
7377 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007378{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007379 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7380}
7381
Victor Stinner3a50e702011-10-18 21:21:00 +02007382static DWORD
7383encode_code_page_flags(UINT code_page, const char *errors)
7384{
7385 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007386 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 }
7388 else if (code_page == CP_UTF7) {
7389 /* CP_UTF7 only supports flags=0 */
7390 return 0;
7391 }
7392 else {
7393 if (errors != NULL && strcmp(errors, "replace") == 0)
7394 return 0;
7395 else
7396 return WC_NO_BEST_FIT_CHARS;
7397 }
7398}
7399
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007400/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007401 * Encode a Unicode string to a Windows code page into a byte string in strict
7402 * mode.
7403 *
7404 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007405 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007406 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007407static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007408encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007409 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411{
Victor Stinner554f3f02010-06-16 23:33:54 +00007412 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 BOOL *pusedDefaultChar = &usedDefaultChar;
7414 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007415 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007416 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 const DWORD flags = encode_code_page_flags(code_page, NULL);
7418 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007419 /* Create a substring so that we can get the UTF-16 representation
7420 of just the slice under consideration. */
7421 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422
Martin v. Löwis3d325192011-11-04 18:23:06 +01007423 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007424
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007426 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007428 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007429
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 substring = PyUnicode_Substring(unicode, offset, offset+len);
7431 if (substring == NULL)
7432 return -1;
7433 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7434 if (p == NULL) {
7435 Py_DECREF(substring);
7436 return -1;
7437 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007438 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007439
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007440 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007442 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 NULL, 0,
7444 NULL, pusedDefaultChar);
7445 if (outsize <= 0)
7446 goto error;
7447 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007448 if (pusedDefaultChar && *pusedDefaultChar) {
7449 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007451 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007452
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 if (*outbytes == NULL) {
7457 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007461 }
7462 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 const Py_ssize_t n = PyBytes_Size(*outbytes);
7465 if (outsize > PY_SSIZE_T_MAX - n) {
7466 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007470 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7471 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475 }
7476
7477 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007479 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 out, outsize,
7481 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 if (outsize <= 0)
7484 goto error;
7485 if (pusedDefaultChar && *pusedDefaultChar)
7486 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007487 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007488
Victor Stinner3a50e702011-10-18 21:21:00 +02007489error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007490 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7492 return -2;
7493 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007494 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007495}
7496
Victor Stinner3a50e702011-10-18 21:21:00 +02007497/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007498 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 * error handler.
7500 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007501 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 * -1 on other error.
7503 */
7504static int
7505encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007506 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007507 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007508{
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007510 Py_ssize_t pos = unicode_offset;
7511 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 /* Ideally, we should get reason from FormatMessage. This is the Windows
7513 2000 English version of the message. */
7514 const char *reason = "invalid character";
7515 /* 4=maximum length of a UTF-8 sequence */
7516 char buffer[4];
7517 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7518 Py_ssize_t outsize;
7519 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 PyObject *errorHandler = NULL;
7521 PyObject *exc = NULL;
7522 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007523 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007524 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 PyObject *rep;
7526 int ret = -1;
7527
7528 assert(insize > 0);
7529
7530 encoding = code_page_name(code_page, &encoding_obj);
7531 if (encoding == NULL)
7532 return -1;
7533
7534 if (errors == NULL || strcmp(errors, "strict") == 0) {
7535 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7536 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007537 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 if (exc != NULL) {
7539 PyCodec_StrictErrors(exc);
7540 Py_DECREF(exc);
7541 }
7542 Py_XDECREF(encoding_obj);
7543 return -1;
7544 }
7545
7546 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7547 pusedDefaultChar = &usedDefaultChar;
7548 else
7549 pusedDefaultChar = NULL;
7550
7551 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7552 PyErr_NoMemory();
7553 goto error;
7554 }
7555 outsize = insize * Py_ARRAY_LENGTH(buffer);
7556
7557 if (*outbytes == NULL) {
7558 /* Create string object */
7559 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7560 if (*outbytes == NULL)
7561 goto error;
7562 out = PyBytes_AS_STRING(*outbytes);
7563 }
7564 else {
7565 /* Extend string object */
7566 Py_ssize_t n = PyBytes_Size(*outbytes);
7567 if (n > PY_SSIZE_T_MAX - outsize) {
7568 PyErr_NoMemory();
7569 goto error;
7570 }
7571 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7572 goto error;
7573 out = PyBytes_AS_STRING(*outbytes) + n;
7574 }
7575
7576 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007577 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007579 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7580 wchar_t chars[2];
7581 int charsize;
7582 if (ch < 0x10000) {
7583 chars[0] = (wchar_t)ch;
7584 charsize = 1;
7585 }
7586 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007587 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7588 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007589 charsize = 2;
7590 }
7591
Victor Stinner3a50e702011-10-18 21:21:00 +02007592 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007593 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 buffer, Py_ARRAY_LENGTH(buffer),
7595 NULL, pusedDefaultChar);
7596 if (outsize > 0) {
7597 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7598 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007599 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 memcpy(out, buffer, outsize);
7601 out += outsize;
7602 continue;
7603 }
7604 }
7605 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7606 PyErr_SetFromWindowsErr(0);
7607 goto error;
7608 }
7609
Victor Stinner3a50e702011-10-18 21:21:00 +02007610 rep = unicode_encode_call_errorhandler(
7611 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007612 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007613 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 if (rep == NULL)
7615 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007616 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007617
7618 if (PyBytes_Check(rep)) {
7619 outsize = PyBytes_GET_SIZE(rep);
7620 if (outsize != 1) {
7621 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7622 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7623 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7624 Py_DECREF(rep);
7625 goto error;
7626 }
7627 out = PyBytes_AS_STRING(*outbytes) + offset;
7628 }
7629 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7630 out += outsize;
7631 }
7632 else {
7633 Py_ssize_t i;
7634 enum PyUnicode_Kind kind;
7635 void *data;
7636
Benjamin Petersonbac79492012-01-14 13:34:47 -05007637 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007638 Py_DECREF(rep);
7639 goto error;
7640 }
7641
7642 outsize = PyUnicode_GET_LENGTH(rep);
7643 if (outsize != 1) {
7644 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7645 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7646 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7647 Py_DECREF(rep);
7648 goto error;
7649 }
7650 out = PyBytes_AS_STRING(*outbytes) + offset;
7651 }
7652 kind = PyUnicode_KIND(rep);
7653 data = PyUnicode_DATA(rep);
7654 for (i=0; i < outsize; i++) {
7655 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7656 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007657 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007658 encoding, unicode,
7659 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007660 "unable to encode error handler result to ASCII");
7661 Py_DECREF(rep);
7662 goto error;
7663 }
7664 *out = (unsigned char)ch;
7665 out++;
7666 }
7667 }
7668 Py_DECREF(rep);
7669 }
7670 /* write a NUL byte */
7671 *out = 0;
7672 outsize = out - PyBytes_AS_STRING(*outbytes);
7673 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7674 if (_PyBytes_Resize(outbytes, outsize) < 0)
7675 goto error;
7676 ret = 0;
7677
7678error:
7679 Py_XDECREF(encoding_obj);
7680 Py_XDECREF(errorHandler);
7681 Py_XDECREF(exc);
7682 return ret;
7683}
7684
Victor Stinner3a50e702011-10-18 21:21:00 +02007685static PyObject *
7686encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007687 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007688 const char *errors)
7689{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007690 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007691 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007692 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007693 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007694
Victor Stinner29dacf22015-01-26 16:41:32 +01007695 if (!PyUnicode_Check(unicode)) {
7696 PyErr_BadArgument();
7697 return NULL;
7698 }
7699
Benjamin Petersonbac79492012-01-14 13:34:47 -05007700 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007701 return NULL;
7702 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007703
Victor Stinner3a50e702011-10-18 21:21:00 +02007704 if (code_page < 0) {
7705 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7706 return NULL;
7707 }
7708
Martin v. Löwis3d325192011-11-04 18:23:06 +01007709 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007710 return PyBytes_FromStringAndSize(NULL, 0);
7711
Victor Stinner7581cef2011-11-03 22:32:33 +01007712 offset = 0;
7713 do
7714 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007715#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007716 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007717 chunks. */
7718 if (len > INT_MAX/2) {
7719 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007720 done = 0;
7721 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007722 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007723#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007724 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 done = 1;
7727 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007728
Victor Stinner76a31a62011-11-04 00:05:13 +01007729 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007730 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 errors);
7732 if (ret == -2)
7733 ret = encode_code_page_errors(code_page, &outbytes,
7734 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007735 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007736 if (ret < 0) {
7737 Py_XDECREF(outbytes);
7738 return NULL;
7739 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007740
Victor Stinner7581cef2011-11-03 22:32:33 +01007741 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007743 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007744
Victor Stinner3a50e702011-10-18 21:21:00 +02007745 return outbytes;
7746}
7747
7748PyObject *
7749PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7750 Py_ssize_t size,
7751 const char *errors)
7752{
Victor Stinner7581cef2011-11-03 22:32:33 +01007753 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007754 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007755 if (unicode == NULL)
7756 return NULL;
7757 res = encode_code_page(CP_ACP, unicode, errors);
7758 Py_DECREF(unicode);
7759 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007760}
7761
7762PyObject *
7763PyUnicode_EncodeCodePage(int code_page,
7764 PyObject *unicode,
7765 const char *errors)
7766{
Victor Stinner7581cef2011-11-03 22:32:33 +01007767 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007768}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007769
Alexander Belopolsky40018472011-02-26 01:02:56 +00007770PyObject *
7771PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007772{
Victor Stinner7581cef2011-11-03 22:32:33 +01007773 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007774}
7775
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007776#undef NEED_RETRY
7777
Steve Dowercc16be82016-09-08 10:35:16 -07007778#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007779
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780/* --- Character Mapping Codec -------------------------------------------- */
7781
Victor Stinnerfb161b12013-04-18 01:44:27 +02007782static int
7783charmap_decode_string(const char *s,
7784 Py_ssize_t size,
7785 PyObject *mapping,
7786 const char *errors,
7787 _PyUnicodeWriter *writer)
7788{
7789 const char *starts = s;
7790 const char *e;
7791 Py_ssize_t startinpos, endinpos;
7792 PyObject *errorHandler = NULL, *exc = NULL;
7793 Py_ssize_t maplen;
7794 enum PyUnicode_Kind mapkind;
7795 void *mapdata;
7796 Py_UCS4 x;
7797 unsigned char ch;
7798
7799 if (PyUnicode_READY(mapping) == -1)
7800 return -1;
7801
7802 maplen = PyUnicode_GET_LENGTH(mapping);
7803 mapdata = PyUnicode_DATA(mapping);
7804 mapkind = PyUnicode_KIND(mapping);
7805
7806 e = s + size;
7807
7808 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7809 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7810 * is disabled in encoding aliases, latin1 is preferred because
7811 * its implementation is faster. */
7812 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7813 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7814 Py_UCS4 maxchar = writer->maxchar;
7815
7816 assert (writer->kind == PyUnicode_1BYTE_KIND);
7817 while (s < e) {
7818 ch = *s;
7819 x = mapdata_ucs1[ch];
7820 if (x > maxchar) {
7821 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7822 goto onError;
7823 maxchar = writer->maxchar;
7824 outdata = (Py_UCS1 *)writer->data;
7825 }
7826 outdata[writer->pos] = x;
7827 writer->pos++;
7828 ++s;
7829 }
7830 return 0;
7831 }
7832
7833 while (s < e) {
7834 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7835 enum PyUnicode_Kind outkind = writer->kind;
7836 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7837 if (outkind == PyUnicode_1BYTE_KIND) {
7838 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7839 Py_UCS4 maxchar = writer->maxchar;
7840 while (s < e) {
7841 ch = *s;
7842 x = mapdata_ucs2[ch];
7843 if (x > maxchar)
7844 goto Error;
7845 outdata[writer->pos] = x;
7846 writer->pos++;
7847 ++s;
7848 }
7849 break;
7850 }
7851 else if (outkind == PyUnicode_2BYTE_KIND) {
7852 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7853 while (s < e) {
7854 ch = *s;
7855 x = mapdata_ucs2[ch];
7856 if (x == 0xFFFE)
7857 goto Error;
7858 outdata[writer->pos] = x;
7859 writer->pos++;
7860 ++s;
7861 }
7862 break;
7863 }
7864 }
7865 ch = *s;
7866
7867 if (ch < maplen)
7868 x = PyUnicode_READ(mapkind, mapdata, ch);
7869 else
7870 x = 0xfffe; /* invalid value */
7871Error:
7872 if (x == 0xfffe)
7873 {
7874 /* undefined mapping */
7875 startinpos = s-starts;
7876 endinpos = startinpos+1;
7877 if (unicode_decode_call_errorhandler_writer(
7878 errors, &errorHandler,
7879 "charmap", "character maps to <undefined>",
7880 &starts, &e, &startinpos, &endinpos, &exc, &s,
7881 writer)) {
7882 goto onError;
7883 }
7884 continue;
7885 }
7886
7887 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7888 goto onError;
7889 ++s;
7890 }
7891 Py_XDECREF(errorHandler);
7892 Py_XDECREF(exc);
7893 return 0;
7894
7895onError:
7896 Py_XDECREF(errorHandler);
7897 Py_XDECREF(exc);
7898 return -1;
7899}
7900
7901static int
7902charmap_decode_mapping(const char *s,
7903 Py_ssize_t size,
7904 PyObject *mapping,
7905 const char *errors,
7906 _PyUnicodeWriter *writer)
7907{
7908 const char *starts = s;
7909 const char *e;
7910 Py_ssize_t startinpos, endinpos;
7911 PyObject *errorHandler = NULL, *exc = NULL;
7912 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007913 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007914
7915 e = s + size;
7916
7917 while (s < e) {
7918 ch = *s;
7919
7920 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7921 key = PyLong_FromLong((long)ch);
7922 if (key == NULL)
7923 goto onError;
7924
7925 item = PyObject_GetItem(mapping, key);
7926 Py_DECREF(key);
7927 if (item == NULL) {
7928 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7929 /* No mapping found means: mapping is undefined. */
7930 PyErr_Clear();
7931 goto Undefined;
7932 } else
7933 goto onError;
7934 }
7935
7936 /* Apply mapping */
7937 if (item == Py_None)
7938 goto Undefined;
7939 if (PyLong_Check(item)) {
7940 long value = PyLong_AS_LONG(item);
7941 if (value == 0xFFFE)
7942 goto Undefined;
7943 if (value < 0 || value > MAX_UNICODE) {
7944 PyErr_Format(PyExc_TypeError,
7945 "character mapping must be in range(0x%lx)",
7946 (unsigned long)MAX_UNICODE + 1);
7947 goto onError;
7948 }
7949
7950 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7951 goto onError;
7952 }
7953 else if (PyUnicode_Check(item)) {
7954 if (PyUnicode_READY(item) == -1)
7955 goto onError;
7956 if (PyUnicode_GET_LENGTH(item) == 1) {
7957 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7958 if (value == 0xFFFE)
7959 goto Undefined;
7960 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7961 goto onError;
7962 }
7963 else {
7964 writer->overallocate = 1;
7965 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7966 goto onError;
7967 }
7968 }
7969 else {
7970 /* wrong return value */
7971 PyErr_SetString(PyExc_TypeError,
7972 "character mapping must return integer, None or str");
7973 goto onError;
7974 }
7975 Py_CLEAR(item);
7976 ++s;
7977 continue;
7978
7979Undefined:
7980 /* undefined mapping */
7981 Py_CLEAR(item);
7982 startinpos = s-starts;
7983 endinpos = startinpos+1;
7984 if (unicode_decode_call_errorhandler_writer(
7985 errors, &errorHandler,
7986 "charmap", "character maps to <undefined>",
7987 &starts, &e, &startinpos, &endinpos, &exc, &s,
7988 writer)) {
7989 goto onError;
7990 }
7991 }
7992 Py_XDECREF(errorHandler);
7993 Py_XDECREF(exc);
7994 return 0;
7995
7996onError:
7997 Py_XDECREF(item);
7998 Py_XDECREF(errorHandler);
7999 Py_XDECREF(exc);
8000 return -1;
8001}
8002
Alexander Belopolsky40018472011-02-26 01:02:56 +00008003PyObject *
8004PyUnicode_DecodeCharmap(const char *s,
8005 Py_ssize_t size,
8006 PyObject *mapping,
8007 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008009 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008010
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 /* Default to Latin-1 */
8012 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008016 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008017 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008018 writer.min_length = size;
8019 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008021
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008022 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008023 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8024 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008025 }
8026 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008027 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8028 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008030 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008031
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008033 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034 return NULL;
8035}
8036
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037/* Charmap encoding: the lookup table */
8038
Alexander Belopolsky40018472011-02-26 01:02:56 +00008039struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 PyObject_HEAD
8041 unsigned char level1[32];
8042 int count2, count3;
8043 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044};
8045
8046static PyObject*
8047encoding_map_size(PyObject *obj, PyObject* args)
8048{
8049 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008050 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008052}
8053
8054static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008055 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 PyDoc_STR("Return the size (in bytes) of this object") },
8057 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058};
8059
8060static void
8061encoding_map_dealloc(PyObject* o)
8062{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008063 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064}
8065
8066static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 "EncodingMap", /*tp_name*/
8069 sizeof(struct encoding_map), /*tp_basicsize*/
8070 0, /*tp_itemsize*/
8071 /* methods */
8072 encoding_map_dealloc, /*tp_dealloc*/
8073 0, /*tp_print*/
8074 0, /*tp_getattr*/
8075 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008076 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 0, /*tp_repr*/
8078 0, /*tp_as_number*/
8079 0, /*tp_as_sequence*/
8080 0, /*tp_as_mapping*/
8081 0, /*tp_hash*/
8082 0, /*tp_call*/
8083 0, /*tp_str*/
8084 0, /*tp_getattro*/
8085 0, /*tp_setattro*/
8086 0, /*tp_as_buffer*/
8087 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8088 0, /*tp_doc*/
8089 0, /*tp_traverse*/
8090 0, /*tp_clear*/
8091 0, /*tp_richcompare*/
8092 0, /*tp_weaklistoffset*/
8093 0, /*tp_iter*/
8094 0, /*tp_iternext*/
8095 encoding_map_methods, /*tp_methods*/
8096 0, /*tp_members*/
8097 0, /*tp_getset*/
8098 0, /*tp_base*/
8099 0, /*tp_dict*/
8100 0, /*tp_descr_get*/
8101 0, /*tp_descr_set*/
8102 0, /*tp_dictoffset*/
8103 0, /*tp_init*/
8104 0, /*tp_alloc*/
8105 0, /*tp_new*/
8106 0, /*tp_free*/
8107 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108};
8109
8110PyObject*
8111PyUnicode_BuildEncodingMap(PyObject* string)
8112{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 PyObject *result;
8114 struct encoding_map *mresult;
8115 int i;
8116 int need_dict = 0;
8117 unsigned char level1[32];
8118 unsigned char level2[512];
8119 unsigned char *mlevel1, *mlevel2, *mlevel3;
8120 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008121 int kind;
8122 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008123 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008124 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008126 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008127 PyErr_BadArgument();
8128 return NULL;
8129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 kind = PyUnicode_KIND(string);
8131 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008132 length = PyUnicode_GET_LENGTH(string);
8133 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 memset(level1, 0xFF, sizeof level1);
8135 memset(level2, 0xFF, sizeof level2);
8136
8137 /* If there isn't a one-to-one mapping of NULL to \0,
8138 or if there are non-BMP characters, we need to use
8139 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008142 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 ch = PyUnicode_READ(kind, data, i);
8145 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 need_dict = 1;
8147 break;
8148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 /* unmapped character */
8151 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152 l1 = ch >> 11;
8153 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 if (level1[l1] == 0xFF)
8155 level1[l1] = count2++;
8156 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008157 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158 }
8159
8160 if (count2 >= 0xFF || count3 >= 0xFF)
8161 need_dict = 1;
8162
8163 if (need_dict) {
8164 PyObject *result = PyDict_New();
8165 PyObject *key, *value;
8166 if (!result)
8167 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008168 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008170 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008171 if (!key || !value)
8172 goto failed1;
8173 if (PyDict_SetItem(result, key, value) == -1)
8174 goto failed1;
8175 Py_DECREF(key);
8176 Py_DECREF(value);
8177 }
8178 return result;
8179 failed1:
8180 Py_XDECREF(key);
8181 Py_XDECREF(value);
8182 Py_DECREF(result);
8183 return NULL;
8184 }
8185
8186 /* Create a three-level trie */
8187 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8188 16*count2 + 128*count3 - 1);
8189 if (!result)
8190 return PyErr_NoMemory();
8191 PyObject_Init(result, &EncodingMapType);
8192 mresult = (struct encoding_map*)result;
8193 mresult->count2 = count2;
8194 mresult->count3 = count3;
8195 mlevel1 = mresult->level1;
8196 mlevel2 = mresult->level23;
8197 mlevel3 = mresult->level23 + 16*count2;
8198 memcpy(mlevel1, level1, 32);
8199 memset(mlevel2, 0xFF, 16*count2);
8200 memset(mlevel3, 0, 128*count3);
8201 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008202 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008203 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008204 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8205 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008206 /* unmapped character */
8207 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008208 o1 = ch>>11;
8209 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008210 i2 = 16*mlevel1[o1] + o2;
8211 if (mlevel2[i2] == 0xFF)
8212 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008213 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 i3 = 128*mlevel2[i2] + o3;
8215 mlevel3[i3] = i;
8216 }
8217 return result;
8218}
8219
8220static int
Victor Stinner22168992011-11-20 17:09:18 +01008221encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222{
8223 struct encoding_map *map = (struct encoding_map*)mapping;
8224 int l1 = c>>11;
8225 int l2 = (c>>7) & 0xF;
8226 int l3 = c & 0x7F;
8227 int i;
8228
Victor Stinner22168992011-11-20 17:09:18 +01008229 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008231 if (c == 0)
8232 return 0;
8233 /* level 1*/
8234 i = map->level1[l1];
8235 if (i == 0xFF) {
8236 return -1;
8237 }
8238 /* level 2*/
8239 i = map->level23[16*i+l2];
8240 if (i == 0xFF) {
8241 return -1;
8242 }
8243 /* level 3 */
8244 i = map->level23[16*map->count2 + 128*i + l3];
8245 if (i == 0) {
8246 return -1;
8247 }
8248 return i;
8249}
8250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251/* Lookup the character ch in the mapping. If the character
8252 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008253 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008254static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008255charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256{
Christian Heimes217cfd12007-12-02 14:31:20 +00008257 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 PyObject *x;
8259
8260 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 x = PyObject_GetItem(mapping, w);
8263 Py_DECREF(w);
8264 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8266 /* No mapping found means: mapping is undefined. */
8267 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008268 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 } else
8270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008272 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008274 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 long value = PyLong_AS_LONG(x);
8276 if (value < 0 || value > 255) {
8277 PyErr_SetString(PyExc_TypeError,
8278 "character mapping must be in range(256)");
8279 Py_DECREF(x);
8280 return NULL;
8281 }
8282 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008284 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 /* wrong return value */
8288 PyErr_Format(PyExc_TypeError,
8289 "character mapping must return integer, bytes or None, not %.400s",
8290 x->ob_type->tp_name);
8291 Py_DECREF(x);
8292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 }
8294}
8295
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008296static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008297charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008298{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008299 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8300 /* exponentially overallocate to minimize reallocations */
8301 if (requiredsize < 2*outsize)
8302 requiredsize = 2*outsize;
8303 if (_PyBytes_Resize(outobj, requiredsize))
8304 return -1;
8305 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008306}
8307
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008312 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313 space is available. Return a new reference to the object that
8314 was put in the output buffer, or Py_None, if the mapping was undefined
8315 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008316 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008317static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008318charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008319 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008321 PyObject *rep;
8322 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008323 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324
Christian Heimes90aa7642007-12-19 02:45:37 +00008325 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008326 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 if (res == -1)
8329 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 if (outsize<requiredsize)
8331 if (charmapencode_resize(outobj, outpos, requiredsize))
8332 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008333 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 outstart[(*outpos)++] = (char)res;
8335 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008336 }
8337
8338 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008341 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 Py_DECREF(rep);
8343 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008344 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 if (PyLong_Check(rep)) {
8346 Py_ssize_t requiredsize = *outpos+1;
8347 if (outsize<requiredsize)
8348 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8349 Py_DECREF(rep);
8350 return enc_EXCEPTION;
8351 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008352 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008354 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 else {
8356 const char *repchars = PyBytes_AS_STRING(rep);
8357 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8358 Py_ssize_t requiredsize = *outpos+repsize;
8359 if (outsize<requiredsize)
8360 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8361 Py_DECREF(rep);
8362 return enc_EXCEPTION;
8363 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008364 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 memcpy(outstart + *outpos, repchars, repsize);
8366 *outpos += repsize;
8367 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008369 Py_DECREF(rep);
8370 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371}
8372
8373/* handle an error in PyUnicode_EncodeCharmap
8374 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008375static int
8376charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008377 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008379 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008380 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381{
8382 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008383 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008384 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008385 enum PyUnicode_Kind kind;
8386 void *data;
8387 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008389 Py_ssize_t collstartpos = *inpos;
8390 Py_ssize_t collendpos = *inpos+1;
8391 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 char *encoding = "charmap";
8393 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008394 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008395 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008396 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397
Benjamin Petersonbac79492012-01-14 13:34:47 -05008398 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008399 return -1;
8400 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 /* find all unencodable characters */
8402 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008403 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008404 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008405 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008406 val = encoding_map_lookup(ch, mapping);
8407 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 break;
8409 ++collendpos;
8410 continue;
8411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008413 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8414 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 if (rep==NULL)
8416 return -1;
8417 else if (rep!=Py_None) {
8418 Py_DECREF(rep);
8419 break;
8420 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 }
8424 /* cache callback name lookup
8425 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008426 if (*error_handler == _Py_ERROR_UNKNOWN)
8427 *error_handler = get_error_handler(errors);
8428
8429 switch (*error_handler) {
8430 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008431 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008433
8434 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008435 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 x = charmapencode_output('?', mapping, res, respos);
8437 if (x==enc_EXCEPTION) {
8438 return -1;
8439 }
8440 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008441 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return -1;
8443 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 }
8445 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008446 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 *inpos = collendpos;
8448 break;
Victor Stinner50149202015-09-22 00:26:54 +02008449
8450 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 /* generate replacement (temporarily (mis)uses p) */
8452 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 char buffer[2+29+1+1];
8454 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008455 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 for (cp = buffer; *cp; ++cp) {
8457 x = charmapencode_output(*cp, mapping, res, respos);
8458 if (x==enc_EXCEPTION)
8459 return -1;
8460 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008461 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 return -1;
8463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 }
8465 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008466 *inpos = collendpos;
8467 break;
Victor Stinner50149202015-09-22 00:26:54 +02008468
Benjamin Peterson14339b62009-01-31 16:36:08 +00008469 default:
Victor Stinner50149202015-09-22 00:26:54 +02008470 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008471 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008475 if (PyBytes_Check(repunicode)) {
8476 /* Directly copy bytes result to output. */
8477 Py_ssize_t outsize = PyBytes_Size(*res);
8478 Py_ssize_t requiredsize;
8479 repsize = PyBytes_Size(repunicode);
8480 requiredsize = *respos + repsize;
8481 if (requiredsize > outsize)
8482 /* Make room for all additional bytes. */
8483 if (charmapencode_resize(res, respos, requiredsize)) {
8484 Py_DECREF(repunicode);
8485 return -1;
8486 }
8487 memcpy(PyBytes_AsString(*res) + *respos,
8488 PyBytes_AsString(repunicode), repsize);
8489 *respos += repsize;
8490 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008491 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008492 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008493 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008495 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008496 Py_DECREF(repunicode);
8497 return -1;
8498 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008499 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008500 data = PyUnicode_DATA(repunicode);
8501 kind = PyUnicode_KIND(repunicode);
8502 for (index = 0; index < repsize; index++) {
8503 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8504 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008506 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 return -1;
8508 }
8509 else if (x==enc_FAILED) {
8510 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008511 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 return -1;
8513 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514 }
8515 *inpos = newpos;
8516 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 }
8518 return 0;
8519}
8520
Alexander Belopolsky40018472011-02-26 01:02:56 +00008521PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008522_PyUnicode_EncodeCharmap(PyObject *unicode,
8523 PyObject *mapping,
8524 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 /* output object */
8527 PyObject *res = NULL;
8528 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008529 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008530 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008532 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008533 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008535 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008536 void *data;
8537 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538
Benjamin Petersonbac79492012-01-14 13:34:47 -05008539 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008540 return NULL;
8541 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008542 data = PyUnicode_DATA(unicode);
8543 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008544
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545 /* Default to Latin-1 */
8546 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008547 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 /* allocate enough for a simple encoding without
8550 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008551 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 if (res == NULL)
8553 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008554 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008558 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 if (x==enc_EXCEPTION) /* error */
8562 goto onError;
8563 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008566 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 &res, &respos)) {
8568 goto onError;
8569 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 else
8572 /* done with this character => adjust input position */
8573 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008577 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008578 if (_PyBytes_Resize(&res, respos) < 0)
8579 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008582 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 return res;
8584
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586 Py_XDECREF(res);
8587 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008588 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 return NULL;
8590}
8591
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008592/* Deprecated */
8593PyObject *
8594PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8595 Py_ssize_t size,
8596 PyObject *mapping,
8597 const char *errors)
8598{
8599 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008600 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601 if (unicode == NULL)
8602 return NULL;
8603 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8604 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008605 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008606}
8607
Alexander Belopolsky40018472011-02-26 01:02:56 +00008608PyObject *
8609PyUnicode_AsCharmapString(PyObject *unicode,
8610 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611{
8612 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 PyErr_BadArgument();
8614 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008616 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617}
8618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008620static void
8621make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008623 Py_ssize_t startpos, Py_ssize_t endpos,
8624 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 *exceptionObject = _PyUnicodeTranslateError_Create(
8628 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 }
8630 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8632 goto onError;
8633 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8634 goto onError;
8635 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8636 goto onError;
8637 return;
8638 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008639 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 }
8641}
8642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643/* error handling callback helper:
8644 build arguments, call the callback and check the arguments,
8645 put the result into newpos and return the replacement string, which
8646 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008647static PyObject *
8648unicode_translate_call_errorhandler(const char *errors,
8649 PyObject **errorHandler,
8650 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008652 Py_ssize_t startpos, Py_ssize_t endpos,
8653 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008655 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008657 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658 PyObject *restuple;
8659 PyObject *resunicode;
8660
8661 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 }
8666
8667 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008672 restuple = PyObject_CallFunctionObjArgs(
8673 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008677 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 Py_DECREF(restuple);
8679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008681 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 &resunicode, &i_newpos)) {
8683 Py_DECREF(restuple);
8684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008686 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008688 else
8689 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008691 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 Py_DECREF(restuple);
8693 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008694 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 Py_INCREF(resunicode);
8696 Py_DECREF(restuple);
8697 return resunicode;
8698}
8699
8700/* Lookup the character ch in the mapping and put the result in result,
8701 which must be decrefed by the caller.
8702 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008703static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705{
Christian Heimes217cfd12007-12-02 14:31:20 +00008706 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 PyObject *x;
8708
8709 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 x = PyObject_GetItem(mapping, w);
8712 Py_DECREF(w);
8713 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8715 /* No mapping found means: use 1:1 mapping. */
8716 PyErr_Clear();
8717 *result = NULL;
8718 return 0;
8719 } else
8720 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721 }
8722 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 *result = x;
8724 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008725 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008726 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008728 if (value < 0 || value > MAX_UNICODE) {
8729 PyErr_Format(PyExc_ValueError,
8730 "character mapping must be in range(0x%x)",
8731 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 Py_DECREF(x);
8733 return -1;
8734 }
8735 *result = x;
8736 return 0;
8737 }
8738 else if (PyUnicode_Check(x)) {
8739 *result = x;
8740 return 0;
8741 }
8742 else {
8743 /* wrong return value */
8744 PyErr_SetString(PyExc_TypeError,
8745 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008746 Py_DECREF(x);
8747 return -1;
8748 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749}
Victor Stinner1194ea02014-04-04 19:37:40 +02008750
8751/* lookup the character, write the result into the writer.
8752 Return 1 if the result was written into the writer, return 0 if the mapping
8753 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008754static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008755charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8756 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757{
Victor Stinner1194ea02014-04-04 19:37:40 +02008758 PyObject *item;
8759
8760 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008762
8763 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008765 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008768 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008770
8771 if (item == Py_None) {
8772 Py_DECREF(item);
8773 return 0;
8774 }
8775
8776 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008777 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8778 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8779 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008780 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8781 Py_DECREF(item);
8782 return -1;
8783 }
8784 Py_DECREF(item);
8785 return 1;
8786 }
8787
8788 if (!PyUnicode_Check(item)) {
8789 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008791 }
8792
8793 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8794 Py_DECREF(item);
8795 return -1;
8796 }
8797
8798 Py_DECREF(item);
8799 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008800}
8801
Victor Stinner89a76ab2014-04-05 11:44:04 +02008802static int
8803unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8804 Py_UCS1 *translate)
8805{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008806 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008807 int ret = 0;
8808
Victor Stinner89a76ab2014-04-05 11:44:04 +02008809 if (charmaptranslate_lookup(ch, mapping, &item)) {
8810 return -1;
8811 }
8812
8813 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008814 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008815 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008817 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 /* not found => default to 1:1 mapping */
8819 translate[ch] = ch;
8820 return 1;
8821 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008822 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008823 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008824 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8825 used it */
8826 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008827 /* invalid character or character outside ASCII:
8828 skip the fast translate */
8829 goto exit;
8830 }
8831 translate[ch] = (Py_UCS1)replace;
8832 }
8833 else if (PyUnicode_Check(item)) {
8834 Py_UCS4 replace;
8835
8836 if (PyUnicode_READY(item) == -1) {
8837 Py_DECREF(item);
8838 return -1;
8839 }
8840 if (PyUnicode_GET_LENGTH(item) != 1)
8841 goto exit;
8842
8843 replace = PyUnicode_READ_CHAR(item, 0);
8844 if (replace > 127)
8845 goto exit;
8846 translate[ch] = (Py_UCS1)replace;
8847 }
8848 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008849 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008850 goto exit;
8851 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852 ret = 1;
8853
Benjamin Peterson1365de72014-04-07 20:15:41 -04008854 exit:
8855 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 return ret;
8857}
8858
8859/* Fast path for ascii => ascii translation. Return 1 if the whole string
8860 was translated into writer, return 0 if the input string was partially
8861 translated into writer, raise an exception and return -1 on error. */
8862static int
8863unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008864 _PyUnicodeWriter *writer, int ignore,
8865 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866{
Victor Stinner872b2912014-04-05 14:27:07 +02008867 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008868 Py_ssize_t len;
8869 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008870 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008871
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872 len = PyUnicode_GET_LENGTH(input);
8873
Victor Stinner872b2912014-04-05 14:27:07 +02008874 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875
8876 in = PyUnicode_1BYTE_DATA(input);
8877 end = in + len;
8878
8879 assert(PyUnicode_IS_ASCII(writer->buffer));
8880 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8881 out = PyUnicode_1BYTE_DATA(writer->buffer);
8882
Victor Stinner872b2912014-04-05 14:27:07 +02008883 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008885 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008887 int translate = unicode_fast_translate_lookup(mapping, ch,
8888 ascii_table);
8889 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008890 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008891 if (translate == 0)
8892 goto exit;
8893 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008894 }
Victor Stinner872b2912014-04-05 14:27:07 +02008895 if (ch2 == 0xfe) {
8896 if (ignore)
8897 continue;
8898 goto exit;
8899 }
8900 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008901 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008902 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008903 }
Victor Stinner872b2912014-04-05 14:27:07 +02008904 res = 1;
8905
8906exit:
8907 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008908 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008909 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910}
8911
Victor Stinner3222da22015-10-01 22:07:32 +02008912static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913_PyUnicode_TranslateCharmap(PyObject *input,
8914 PyObject *mapping,
8915 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008918 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 Py_ssize_t size, i;
8920 int kind;
8921 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008922 _PyUnicodeWriter writer;
8923 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008924 char *reason = "character maps to <undefined>";
8925 PyObject *errorHandler = NULL;
8926 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008927 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008928 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008929
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 PyErr_BadArgument();
8932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 if (PyUnicode_READY(input) == -1)
8936 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008937 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 kind = PyUnicode_KIND(input);
8939 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008941 if (size == 0)
8942 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008944 /* allocate enough for a simple 1:1 translation without
8945 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008946 _PyUnicodeWriter_Init(&writer);
8947 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949
Victor Stinner872b2912014-04-05 14:27:07 +02008950 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8951
Victor Stinner33798672016-03-01 21:59:58 +01008952 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008953 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008954 if (PyUnicode_IS_ASCII(input)) {
8955 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8956 if (res < 0) {
8957 _PyUnicodeWriter_Dealloc(&writer);
8958 return NULL;
8959 }
8960 if (res == 1)
8961 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008962 }
Victor Stinner33798672016-03-01 21:59:58 +01008963 else {
8964 i = 0;
8965 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008969 int translate;
8970 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8971 Py_ssize_t newpos;
8972 /* startpos for collecting untranslatable chars */
8973 Py_ssize_t collstart;
8974 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008975 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976
Victor Stinner1194ea02014-04-04 19:37:40 +02008977 ch = PyUnicode_READ(kind, data, i);
8978 translate = charmaptranslate_output(ch, mapping, &writer);
8979 if (translate < 0)
8980 goto onError;
8981
8982 if (translate != 0) {
8983 /* it worked => adjust input pointer */
8984 ++i;
8985 continue;
8986 }
8987
8988 /* untranslatable character */
8989 collstart = i;
8990 collend = i+1;
8991
8992 /* find all untranslatable characters */
8993 while (collend < size) {
8994 PyObject *x;
8995 ch = PyUnicode_READ(kind, data, collend);
8996 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008997 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008998 Py_XDECREF(x);
8999 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009001 ++collend;
9002 }
9003
9004 if (ignore) {
9005 i = collend;
9006 }
9007 else {
9008 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9009 reason, input, &exc,
9010 collstart, collend, &newpos);
9011 if (repunicode == NULL)
9012 goto onError;
9013 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009015 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009016 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009017 Py_DECREF(repunicode);
9018 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009019 }
9020 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009021 Py_XDECREF(exc);
9022 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009023 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009026 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009027 Py_XDECREF(exc);
9028 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029 return NULL;
9030}
9031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032/* Deprecated. Use PyUnicode_Translate instead. */
9033PyObject *
9034PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9035 Py_ssize_t size,
9036 PyObject *mapping,
9037 const char *errors)
9038{
Christian Heimes5f520f42012-09-11 14:03:25 +02009039 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009040 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041 if (!unicode)
9042 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009043 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9044 Py_DECREF(unicode);
9045 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046}
9047
Alexander Belopolsky40018472011-02-26 01:02:56 +00009048PyObject *
9049PyUnicode_Translate(PyObject *str,
9050 PyObject *mapping,
9051 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009053 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009054 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009055 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056}
Tim Petersced69f82003-09-16 20:30:58 +00009057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009059fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060{
9061 /* No need to call PyUnicode_READY(self) because this function is only
9062 called as a callback from fixup() which does it already. */
9063 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9064 const int kind = PyUnicode_KIND(self);
9065 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009066 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009067 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 Py_ssize_t i;
9069
9070 for (i = 0; i < len; ++i) {
9071 ch = PyUnicode_READ(kind, data, i);
9072 fixed = 0;
9073 if (ch > 127) {
9074 if (Py_UNICODE_ISSPACE(ch))
9075 fixed = ' ';
9076 else {
9077 const int decimal = Py_UNICODE_TODECIMAL(ch);
9078 if (decimal >= 0)
9079 fixed = '0' + decimal;
9080 }
9081 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009082 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009083 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 PyUnicode_WRITE(kind, data, i, fixed);
9085 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009086 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009087 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 }
9090
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009091 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092}
9093
9094PyObject *
9095_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9096{
9097 if (!PyUnicode_Check(unicode)) {
9098 PyErr_BadInternalCall();
9099 return NULL;
9100 }
9101 if (PyUnicode_READY(unicode) == -1)
9102 return NULL;
9103 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9104 /* If the string is already ASCII, just return the same string */
9105 Py_INCREF(unicode);
9106 return unicode;
9107 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009108 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109}
9110
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009111PyObject *
9112PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9113 Py_ssize_t length)
9114{
Victor Stinnerf0124502011-11-21 23:12:56 +01009115 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009116 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009117 Py_UCS4 maxchar;
9118 enum PyUnicode_Kind kind;
9119 void *data;
9120
Victor Stinner99d7ad02012-02-22 13:37:39 +01009121 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009122 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009123 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009124 if (ch > 127) {
9125 int decimal = Py_UNICODE_TODECIMAL(ch);
9126 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009127 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009128 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009129 }
9130 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009131
9132 /* Copy to a new string */
9133 decimal = PyUnicode_New(length, maxchar);
9134 if (decimal == NULL)
9135 return decimal;
9136 kind = PyUnicode_KIND(decimal);
9137 data = PyUnicode_DATA(decimal);
9138 /* Iterate over code points */
9139 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009140 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009141 if (ch > 127) {
9142 int decimal = Py_UNICODE_TODECIMAL(ch);
9143 if (decimal >= 0)
9144 ch = '0' + decimal;
9145 }
9146 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009148 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009149}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009150/* --- Decimal Encoder ---------------------------------------------------- */
9151
Alexander Belopolsky40018472011-02-26 01:02:56 +00009152int
9153PyUnicode_EncodeDecimal(Py_UNICODE *s,
9154 Py_ssize_t length,
9155 char *output,
9156 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009157{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009158 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009159 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009160 enum PyUnicode_Kind kind;
9161 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009162
9163 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 PyErr_BadArgument();
9165 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009166 }
9167
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009168 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009169 if (unicode == NULL)
9170 return -1;
9171
Victor Stinner42bf7752011-11-21 22:52:58 +01009172 kind = PyUnicode_KIND(unicode);
9173 data = PyUnicode_DATA(unicode);
9174
Victor Stinnerb84d7232011-11-22 01:50:07 +01009175 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009176 PyObject *exc;
9177 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009178 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009179 Py_ssize_t startpos;
9180
9181 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009182
Benjamin Peterson29060642009-01-31 22:14:21 +00009183 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009184 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009185 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009187 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 decimal = Py_UNICODE_TODECIMAL(ch);
9189 if (decimal >= 0) {
9190 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009191 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 continue;
9193 }
9194 if (0 < ch && ch < 256) {
9195 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009196 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009197 continue;
9198 }
Victor Stinner6345be92011-11-25 20:09:01 +01009199
Victor Stinner42bf7752011-11-21 22:52:58 +01009200 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009201 exc = NULL;
9202 raise_encode_exception(&exc, "decimal", unicode,
9203 startpos, startpos+1,
9204 "invalid decimal Unicode string");
9205 Py_XDECREF(exc);
9206 Py_DECREF(unicode);
9207 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009208 }
9209 /* 0-terminate the output string */
9210 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009211 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009212 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009213}
9214
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215/* --- Helpers ------------------------------------------------------------ */
9216
/* helper macro to fixup start/end slice values

   Clamps `end` to at most `len`.  A negative `end` or `start` has `len`
   added to it and, if still negative, is set to 0.  `start` is not
   clamped against `len` or `end`.

   `start` and `end` must be modifiable lvalues; every argument may be
   evaluated more than once.  The body is wrapped in do { ... } while (0)
   so that the expansion is exactly one statement and remains correct
   inside an unbraced if/else; invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009233any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009235 Py_ssize_t end,
9236 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009238 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 void *buf1, *buf2;
9240 Py_ssize_t len1, len2, result;
9241
9242 kind1 = PyUnicode_KIND(s1);
9243 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009244 if (kind1 < kind2)
9245 return -1;
9246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 len1 = PyUnicode_GET_LENGTH(s1);
9248 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009249 ADJUST_INDICES(start, end, len1);
9250 if (end - start < len2)
9251 return -1;
9252
9253 buf1 = PyUnicode_DATA(s1);
9254 buf2 = PyUnicode_DATA(s2);
9255 if (len2 == 1) {
9256 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9257 result = findchar((const char *)buf1 + kind1*start,
9258 kind1, end - start, ch, direction);
9259 if (result == -1)
9260 return -1;
9261 else
9262 return start + result;
9263 }
9264
9265 if (kind2 != kind1) {
9266 buf2 = _PyUnicode_AsKind(s2, kind1);
9267 if (!buf2)
9268 return -2;
9269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270
Victor Stinner794d5672011-10-10 03:21:36 +02009271 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009272 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009273 case PyUnicode_1BYTE_KIND:
9274 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9275 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9276 else
9277 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9278 break;
9279 case PyUnicode_2BYTE_KIND:
9280 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9281 break;
9282 case PyUnicode_4BYTE_KIND:
9283 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9284 break;
9285 default:
9286 assert(0); result = -2;
9287 }
9288 }
9289 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009290 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009291 case PyUnicode_1BYTE_KIND:
9292 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9293 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9294 else
9295 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9296 break;
9297 case PyUnicode_2BYTE_KIND:
9298 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9299 break;
9300 case PyUnicode_4BYTE_KIND:
9301 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9302 break;
9303 default:
9304 assert(0); result = -2;
9305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 }
9307
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009308 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 PyMem_Free(buf2);
9310
9311 return result;
9312}
9313
9314Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009315_PyUnicode_InsertThousandsGrouping(
9316 PyObject *unicode, Py_ssize_t index,
9317 Py_ssize_t n_buffer,
9318 void *digits, Py_ssize_t n_digits,
9319 Py_ssize_t min_width,
9320 const char *grouping, PyObject *thousands_sep,
9321 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322{
Victor Stinner41a863c2012-02-24 00:37:51 +01009323 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009324 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009325 Py_ssize_t thousands_sep_len;
9326 Py_ssize_t len;
9327
9328 if (unicode != NULL) {
9329 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009330 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009331 }
9332 else {
9333 kind = PyUnicode_1BYTE_KIND;
9334 data = NULL;
9335 }
9336 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9337 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9338 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9339 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009340 if (thousands_sep_kind < kind) {
9341 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9342 if (!thousands_sep_data)
9343 return -1;
9344 }
9345 else {
9346 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9347 if (!data)
9348 return -1;
9349 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009350 }
9351
Benjamin Petersonead6b532011-12-20 17:23:42 -06009352 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009354 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009355 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009356 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009358 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009359 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009360 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009361 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009363 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009366 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009367 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009368 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009369 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009370 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009373 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009375 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 break;
9377 default:
9378 assert(0);
9379 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009381 if (unicode != NULL && thousands_sep_kind != kind) {
9382 if (thousands_sep_kind < kind)
9383 PyMem_Free(thousands_sep_data);
9384 else
9385 PyMem_Free(data);
9386 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009387 if (unicode == NULL) {
9388 *maxchar = 127;
9389 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009390 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009391 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009392 }
9393 }
9394 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395}
9396
9397
Alexander Belopolsky40018472011-02-26 01:02:56 +00009398Py_ssize_t
9399PyUnicode_Count(PyObject *str,
9400 PyObject *substr,
9401 Py_ssize_t start,
9402 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009404 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009405 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 void *buf1 = NULL, *buf2 = NULL;
9407 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009408
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009409 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009411
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009412 kind1 = PyUnicode_KIND(str);
9413 kind2 = PyUnicode_KIND(substr);
9414 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009415 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009416
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009417 len1 = PyUnicode_GET_LENGTH(str);
9418 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009420 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009421 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009423 buf1 = PyUnicode_DATA(str);
9424 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009425 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009426 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427 if (!buf2)
9428 goto onError;
9429 }
9430
9431 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009433 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009434 result = asciilib_count(
9435 ((Py_UCS1*)buf1) + start, end - start,
9436 buf2, len2, PY_SSIZE_T_MAX
9437 );
9438 else
9439 result = ucs1lib_count(
9440 ((Py_UCS1*)buf1) + start, end - start,
9441 buf2, len2, PY_SSIZE_T_MAX
9442 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 break;
9444 case PyUnicode_2BYTE_KIND:
9445 result = ucs2lib_count(
9446 ((Py_UCS2*)buf1) + start, end - start,
9447 buf2, len2, PY_SSIZE_T_MAX
9448 );
9449 break;
9450 case PyUnicode_4BYTE_KIND:
9451 result = ucs4lib_count(
9452 ((Py_UCS4*)buf1) + start, end - start,
9453 buf2, len2, PY_SSIZE_T_MAX
9454 );
9455 break;
9456 default:
9457 assert(0); result = 0;
9458 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009459
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009460 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 PyMem_Free(buf2);
9462
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009465 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 PyMem_Free(buf2);
9467 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468}
9469
Alexander Belopolsky40018472011-02-26 01:02:56 +00009470Py_ssize_t
9471PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009472 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009473 Py_ssize_t start,
9474 Py_ssize_t end,
9475 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009477 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009478 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009479
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009480 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481}
9482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483Py_ssize_t
9484PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9485 Py_ssize_t start, Py_ssize_t end,
9486 int direction)
9487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009489 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 if (PyUnicode_READY(str) == -1)
9491 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009492 len = PyUnicode_GET_LENGTH(str);
9493 ADJUST_INDICES(start, end, len);
9494 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009495 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009497 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9498 kind, end-start, ch, direction);
9499 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009501 else
9502 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503}
9504
Alexander Belopolsky40018472011-02-26 01:02:56 +00009505static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009506tailmatch(PyObject *self,
9507 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009508 Py_ssize_t start,
9509 Py_ssize_t end,
9510 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 int kind_self;
9513 int kind_sub;
9514 void *data_self;
9515 void *data_sub;
9516 Py_ssize_t offset;
9517 Py_ssize_t i;
9518 Py_ssize_t end_sub;
9519
9520 if (PyUnicode_READY(self) == -1 ||
9521 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009522 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9525 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009529 if (PyUnicode_GET_LENGTH(substring) == 0)
9530 return 1;
9531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 kind_self = PyUnicode_KIND(self);
9533 data_self = PyUnicode_DATA(self);
9534 kind_sub = PyUnicode_KIND(substring);
9535 data_sub = PyUnicode_DATA(substring);
9536 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9537
9538 if (direction > 0)
9539 offset = end;
9540 else
9541 offset = start;
9542
9543 if (PyUnicode_READ(kind_self, data_self, offset) ==
9544 PyUnicode_READ(kind_sub, data_sub, 0) &&
9545 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9546 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9547 /* If both are of the same kind, memcmp is sufficient */
9548 if (kind_self == kind_sub) {
9549 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009550 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 data_sub,
9552 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009553 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009555 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 else {
9557 /* We do not need to compare 0 and len(substring)-1 because
9558 the if statement above ensured already that they are equal
9559 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 for (i = 1; i < end_sub; ++i) {
9561 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9562 PyUnicode_READ(kind_sub, data_sub, i))
9563 return 0;
9564 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009565 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567 }
9568
9569 return 0;
9570}
9571
Alexander Belopolsky40018472011-02-26 01:02:56 +00009572Py_ssize_t
9573PyUnicode_Tailmatch(PyObject *str,
9574 PyObject *substr,
9575 Py_ssize_t start,
9576 Py_ssize_t end,
9577 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009579 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009580 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009581
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009582 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583}
9584
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585/* Apply fixfct filter to the Unicode object self and return a
9586 reference to the modified object */
9587
Alexander Belopolsky40018472011-02-26 01:02:56 +00009588static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009589fixup(PyObject *self,
9590 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 PyObject *u;
9593 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009594 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009596 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009599 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 /* fix functions return the new maximum character in a string,
9602 if the kind of the resulting unicode object does not change,
9603 everything is fine. Otherwise we need to change the string kind
9604 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009605 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009606
9607 if (maxchar_new == 0) {
9608 /* no changes */;
9609 if (PyUnicode_CheckExact(self)) {
9610 Py_DECREF(u);
9611 Py_INCREF(self);
9612 return self;
9613 }
9614 else
9615 return u;
9616 }
9617
Victor Stinnere6abb482012-05-02 01:15:40 +02009618 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619
Victor Stinnereaab6042011-12-11 22:22:39 +01009620 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009622
9623 /* In case the maximum character changed, we need to
9624 convert the string to the new category. */
9625 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9626 if (v == NULL) {
9627 Py_DECREF(u);
9628 return NULL;
9629 }
9630 if (maxchar_new > maxchar_old) {
9631 /* If the maxchar increased so that the kind changed, not all
9632 characters are representable anymore and we need to fix the
9633 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009634 _PyUnicode_FastCopyCharacters(v, 0,
9635 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009636 maxchar_old = fixfct(v);
9637 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 }
9639 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009640 _PyUnicode_FastCopyCharacters(v, 0,
9641 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009643 Py_DECREF(u);
9644 assert(_PyUnicode_CheckConsistency(v, 1));
9645 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646}
9647
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648static PyObject *
9649ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9652 char *resdata, *data = PyUnicode_DATA(self);
9653 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009654
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655 res = PyUnicode_New(len, 127);
9656 if (res == NULL)
9657 return NULL;
9658 resdata = PyUnicode_DATA(res);
9659 if (lower)
9660 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 _Py_bytes_upper(resdata, data, len);
9663 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664}
9665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 Py_ssize_t j;
9670 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009671 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009673
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9675
9676 where ! is a negation and \p{xxx} is a character with property xxx.
9677 */
9678 for (j = i - 1; j >= 0; j--) {
9679 c = PyUnicode_READ(kind, data, j);
9680 if (!_PyUnicode_IsCaseIgnorable(c))
9681 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9684 if (final_sigma) {
9685 for (j = i + 1; j < length; j++) {
9686 c = PyUnicode_READ(kind, data, j);
9687 if (!_PyUnicode_IsCaseIgnorable(c))
9688 break;
9689 }
9690 final_sigma = j == length || !_PyUnicode_IsCased(c);
9691 }
9692 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693}
9694
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695static int
9696lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9697 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009699 /* Obscure special case. */
9700 if (c == 0x3A3) {
9701 mapped[0] = handle_capital_sigma(kind, data, length, i);
9702 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705}
9706
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707static Py_ssize_t
9708do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 Py_ssize_t i, k = 0;
9711 int n_res, j;
9712 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009713
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 c = PyUnicode_READ(kind, data, 0);
9715 n_res = _PyUnicode_ToUpperFull(c, mapped);
9716 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009717 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 for (i = 1; i < length; i++) {
9721 c = PyUnicode_READ(kind, data, i);
9722 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9723 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009724 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009725 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009726 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009727 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009728 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729}
9730
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731static Py_ssize_t
9732do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9733 Py_ssize_t i, k = 0;
9734
9735 for (i = 0; i < length; i++) {
9736 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9737 int n_res, j;
9738 if (Py_UNICODE_ISUPPER(c)) {
9739 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9740 }
9741 else if (Py_UNICODE_ISLOWER(c)) {
9742 n_res = _PyUnicode_ToUpperFull(c, mapped);
9743 }
9744 else {
9745 n_res = 1;
9746 mapped[0] = c;
9747 }
9748 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009749 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009750 res[k++] = mapped[j];
9751 }
9752 }
9753 return k;
9754}
9755
9756static Py_ssize_t
9757do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9758 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009760 Py_ssize_t i, k = 0;
9761
9762 for (i = 0; i < length; i++) {
9763 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9764 int n_res, j;
9765 if (lower)
9766 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9767 else
9768 n_res = _PyUnicode_ToUpperFull(c, mapped);
9769 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009770 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009771 res[k++] = mapped[j];
9772 }
9773 }
9774 return k;
9775}
9776
9777static Py_ssize_t
9778do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9779{
9780 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9781}
9782
9783static Py_ssize_t
9784do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9785{
9786 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9787}
9788
Benjamin Petersone51757f2012-01-12 21:10:29 -05009789static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009790do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791{
9792 Py_ssize_t i, k = 0;
9793
9794 for (i = 0; i < length; i++) {
9795 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9796 Py_UCS4 mapped[3];
9797 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9798 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009799 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009800 res[k++] = mapped[j];
9801 }
9802 }
9803 return k;
9804}
9805
9806static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009807do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9808{
9809 Py_ssize_t i, k = 0;
9810 int previous_is_cased;
9811
9812 previous_is_cased = 0;
9813 for (i = 0; i < length; i++) {
9814 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9815 Py_UCS4 mapped[3];
9816 int n_res, j;
9817
9818 if (previous_is_cased)
9819 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9820 else
9821 n_res = _PyUnicode_ToTitleFull(c, mapped);
9822
9823 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009824 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009825 res[k++] = mapped[j];
9826 }
9827
9828 previous_is_cased = _PyUnicode_IsCased(c);
9829 }
9830 return k;
9831}
9832
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833static PyObject *
9834case_operation(PyObject *self,
9835 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9836{
9837 PyObject *res = NULL;
9838 Py_ssize_t length, newlength = 0;
9839 int kind, outkind;
9840 void *data, *outdata;
9841 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9842
Benjamin Petersoneea48462012-01-16 14:28:50 -05009843 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009844
9845 kind = PyUnicode_KIND(self);
9846 data = PyUnicode_DATA(self);
9847 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009848 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009849 PyErr_SetString(PyExc_OverflowError, "string is too long");
9850 return NULL;
9851 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009852 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009853 if (tmp == NULL)
9854 return PyErr_NoMemory();
9855 newlength = perform(kind, data, length, tmp, &maxchar);
9856 res = PyUnicode_New(newlength, maxchar);
9857 if (res == NULL)
9858 goto leave;
9859 tmpend = tmp + newlength;
9860 outdata = PyUnicode_DATA(res);
9861 outkind = PyUnicode_KIND(res);
9862 switch (outkind) {
9863 case PyUnicode_1BYTE_KIND:
9864 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9865 break;
9866 case PyUnicode_2BYTE_KIND:
9867 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9868 break;
9869 case PyUnicode_4BYTE_KIND:
9870 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9871 break;
9872 default:
9873 assert(0);
9874 break;
9875 }
9876 leave:
9877 PyMem_FREE(tmp);
9878 return res;
9879}
9880
Tim Peters8ce9f162004-08-27 01:49:32 +00009881PyObject *
9882PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009884 PyObject *res;
9885 PyObject *fseq;
9886 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009887 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009889 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009890 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009891 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009892 }
9893
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009894 /* NOTE: the following code can't call back into Python code,
9895 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009896 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009897
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009898 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009899 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009900 res = _PyUnicode_JoinArray(separator, items, seqlen);
9901 Py_DECREF(fseq);
9902 return res;
9903}
9904
9905PyObject *
9906_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9907{
9908 PyObject *res = NULL; /* the result */
9909 PyObject *sep = NULL;
9910 Py_ssize_t seplen;
9911 PyObject *item;
9912 Py_ssize_t sz, i, res_offset;
9913 Py_UCS4 maxchar;
9914 Py_UCS4 item_maxchar;
9915 int use_memcpy;
9916 unsigned char *res_data = NULL, *sep_data = NULL;
9917 PyObject *last_obj;
9918 unsigned int kind = 0;
9919
Tim Peters05eba1f2004-08-27 21:32:02 +00009920 /* If empty sequence, return u"". */
9921 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009922 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009923 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009924
Tim Peters05eba1f2004-08-27 21:32:02 +00009925 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009926 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009927 if (seqlen == 1) {
9928 if (PyUnicode_CheckExact(items[0])) {
9929 res = items[0];
9930 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009931 return res;
9932 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009933 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009934 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009935 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009936 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009937 /* Set up sep and seplen */
9938 if (separator == NULL) {
9939 /* fall back to a blank space separator */
9940 sep = PyUnicode_FromOrdinal(' ');
9941 if (!sep)
9942 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009943 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009944 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009945 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009946 else {
9947 if (!PyUnicode_Check(separator)) {
9948 PyErr_Format(PyExc_TypeError,
9949 "separator: expected str instance,"
9950 " %.80s found",
9951 Py_TYPE(separator)->tp_name);
9952 goto onError;
9953 }
9954 if (PyUnicode_READY(separator))
9955 goto onError;
9956 sep = separator;
9957 seplen = PyUnicode_GET_LENGTH(separator);
9958 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9959 /* inc refcount to keep this code path symmetric with the
9960 above case of a blank separator */
9961 Py_INCREF(sep);
9962 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009963 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009964 }
9965
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966 /* There are at least two things to join, or else we have a subclass
9967 * of str in the sequence.
9968 * Do a pre-pass to figure out the total amount of space we'll
9969 * need (sz), and see whether all argument are strings.
9970 */
9971 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009972#ifdef Py_DEBUG
9973 use_memcpy = 0;
9974#else
9975 use_memcpy = 1;
9976#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009977 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009978 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009979 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009980 if (!PyUnicode_Check(item)) {
9981 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009982 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 " %.80s found",
9984 i, Py_TYPE(item)->tp_name);
9985 goto onError;
9986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 if (PyUnicode_READY(item) == -1)
9988 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009989 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009991 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009992 if (i != 0) {
9993 add_sz += seplen;
9994 }
9995 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009996 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009998 goto onError;
9999 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010000 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010001 if (use_memcpy && last_obj != NULL) {
10002 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10003 use_memcpy = 0;
10004 }
10005 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 }
Tim Petersced69f82003-09-16 20:30:58 +000010007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 if (res == NULL)
10010 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010011
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010012 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010013#ifdef Py_DEBUG
10014 use_memcpy = 0;
10015#else
10016 if (use_memcpy) {
10017 res_data = PyUnicode_1BYTE_DATA(res);
10018 kind = PyUnicode_KIND(res);
10019 if (seplen != 0)
10020 sep_data = PyUnicode_1BYTE_DATA(sep);
10021 }
10022#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010023 if (use_memcpy) {
10024 for (i = 0; i < seqlen; ++i) {
10025 Py_ssize_t itemlen;
10026 item = items[i];
10027
10028 /* Copy item, and maybe the separator. */
10029 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010030 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010032 kind * seplen);
10033 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010034 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010035
10036 itemlen = PyUnicode_GET_LENGTH(item);
10037 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010038 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010040 kind * itemlen);
10041 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010043 }
10044 assert(res_data == PyUnicode_1BYTE_DATA(res)
10045 + kind * PyUnicode_GET_LENGTH(res));
10046 }
10047 else {
10048 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10049 Py_ssize_t itemlen;
10050 item = items[i];
10051
10052 /* Copy item, and maybe the separator. */
10053 if (i && seplen != 0) {
10054 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10055 res_offset += seplen;
10056 }
10057
10058 itemlen = PyUnicode_GET_LENGTH(item);
10059 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010060 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010061 res_offset += itemlen;
10062 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010063 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010064 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010065 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010068 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010073 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074 return NULL;
10075}
10076
/* Store `value` into `length` consecutive code units of the character
   buffer `data`, beginning at index `start`.  `kind` selects the code-unit
   width (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
   PyUnicode_4BYTE_KIND); any other kind trips an assertion.

   Every argument is parenthesized in the expansion so that compound
   expressions can be passed safely.  `value` and `length` are evaluated
   once per loop iteration for the 2- and 4-byte kinds, so they should be
   free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10100
Victor Stinnerd3f08822012-05-29 12:57:52 +020010101void
10102_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10103 Py_UCS4 fill_char)
10104{
10105 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10106 const void *data = PyUnicode_DATA(unicode);
10107 assert(PyUnicode_IS_READY(unicode));
10108 assert(unicode_modifiable(unicode));
10109 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10110 assert(start >= 0);
10111 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10112 FILL(kind, data, fill_char, start, length);
10113}
10114
Victor Stinner3fe55312012-01-04 00:33:50 +010010115Py_ssize_t
10116PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10117 Py_UCS4 fill_char)
10118{
10119 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010120
10121 if (!PyUnicode_Check(unicode)) {
10122 PyErr_BadInternalCall();
10123 return -1;
10124 }
10125 if (PyUnicode_READY(unicode) == -1)
10126 return -1;
10127 if (unicode_check_modifiable(unicode))
10128 return -1;
10129
Victor Stinnerd3f08822012-05-29 12:57:52 +020010130 if (start < 0) {
10131 PyErr_SetString(PyExc_IndexError, "string index out of range");
10132 return -1;
10133 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010134 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10135 PyErr_SetString(PyExc_ValueError,
10136 "fill character is bigger than "
10137 "the string maximum character");
10138 return -1;
10139 }
10140
10141 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10142 length = Py_MIN(maxlen, length);
10143 if (length <= 0)
10144 return 0;
10145
Victor Stinnerd3f08822012-05-29 12:57:52 +020010146 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010147 return length;
10148}
10149
Victor Stinner9310abb2011-10-05 00:59:23 +020010150static PyObject *
10151pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010152 Py_ssize_t left,
10153 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 PyObject *u;
10157 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010158 int kind;
10159 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160
10161 if (left < 0)
10162 left = 0;
10163 if (right < 0)
10164 right = 0;
10165
Victor Stinnerc4b49542011-12-11 22:44:26 +010010166 if (left == 0 && right == 0)
10167 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10170 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010171 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10172 return NULL;
10173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010175 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010177 if (!u)
10178 return NULL;
10179
10180 kind = PyUnicode_KIND(u);
10181 data = PyUnicode_DATA(u);
10182 if (left)
10183 FILL(kind, data, fill, 0, left);
10184 if (right)
10185 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010186 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010187 assert(_PyUnicode_CheckConsistency(u, 1));
10188 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189}
10190
Alexander Belopolsky40018472011-02-26 01:02:56 +000010191PyObject *
10192PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010196 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198
Benjamin Petersonead6b532011-12-20 17:23:42 -060010199 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010201 if (PyUnicode_IS_ASCII(string))
10202 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010203 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 PyUnicode_GET_LENGTH(string), keepends);
10205 else
10206 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010207 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 break;
10210 case PyUnicode_2BYTE_KIND:
10211 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 PyUnicode_GET_LENGTH(string), keepends);
10214 break;
10215 case PyUnicode_4BYTE_KIND:
10216 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 PyUnicode_GET_LENGTH(string), keepends);
10219 break;
10220 default:
10221 assert(0);
10222 list = 0;
10223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225}
10226
Alexander Belopolsky40018472011-02-26 01:02:56 +000010227static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010228split(PyObject *self,
10229 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010230 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010232 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 void *buf1, *buf2;
10234 Py_ssize_t len1, len2;
10235 PyObject* out;
10236
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010238 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 if (PyUnicode_READY(self) == -1)
10241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010244 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010246 if (PyUnicode_IS_ASCII(self))
10247 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010248 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 PyUnicode_GET_LENGTH(self), maxcount
10250 );
10251 else
10252 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010253 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 PyUnicode_GET_LENGTH(self), maxcount
10255 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 case PyUnicode_2BYTE_KIND:
10257 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010258 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 PyUnicode_GET_LENGTH(self), maxcount
10260 );
10261 case PyUnicode_4BYTE_KIND:
10262 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010263 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 PyUnicode_GET_LENGTH(self), maxcount
10265 );
10266 default:
10267 assert(0);
10268 return NULL;
10269 }
10270
10271 if (PyUnicode_READY(substring) == -1)
10272 return NULL;
10273
10274 kind1 = PyUnicode_KIND(self);
10275 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 len1 = PyUnicode_GET_LENGTH(self);
10277 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010278 if (kind1 < kind2 || len1 < len2) {
10279 out = PyList_New(1);
10280 if (out == NULL)
10281 return NULL;
10282 Py_INCREF(self);
10283 PyList_SET_ITEM(out, 0, self);
10284 return out;
10285 }
10286 buf1 = PyUnicode_DATA(self);
10287 buf2 = PyUnicode_DATA(substring);
10288 if (kind2 != kind1) {
10289 buf2 = _PyUnicode_AsKind(substring, kind1);
10290 if (!buf2)
10291 return NULL;
10292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010294 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010296 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10297 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010298 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 else
10300 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010301 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 break;
10303 case PyUnicode_2BYTE_KIND:
10304 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010305 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 break;
10307 case PyUnicode_4BYTE_KIND:
10308 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 break;
10311 default:
10312 out = NULL;
10313 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010314 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 PyMem_Free(buf2);
10316 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317}
10318
Alexander Belopolsky40018472011-02-26 01:02:56 +000010319static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010320rsplit(PyObject *self,
10321 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010322 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010323{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010324 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 void *buf1, *buf2;
10326 Py_ssize_t len1, len2;
10327 PyObject* out;
10328
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010329 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010330 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 if (PyUnicode_READY(self) == -1)
10333 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010336 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010338 if (PyUnicode_IS_ASCII(self))
10339 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 PyUnicode_GET_LENGTH(self), maxcount
10342 );
10343 else
10344 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010346 PyUnicode_GET_LENGTH(self), maxcount
10347 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 case PyUnicode_2BYTE_KIND:
10349 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 PyUnicode_GET_LENGTH(self), maxcount
10352 );
10353 case PyUnicode_4BYTE_KIND:
10354 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 PyUnicode_GET_LENGTH(self), maxcount
10357 );
10358 default:
10359 assert(0);
10360 return NULL;
10361 }
10362
10363 if (PyUnicode_READY(substring) == -1)
10364 return NULL;
10365
10366 kind1 = PyUnicode_KIND(self);
10367 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 len1 = PyUnicode_GET_LENGTH(self);
10369 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010370 if (kind1 < kind2 || len1 < len2) {
10371 out = PyList_New(1);
10372 if (out == NULL)
10373 return NULL;
10374 Py_INCREF(self);
10375 PyList_SET_ITEM(out, 0, self);
10376 return out;
10377 }
10378 buf1 = PyUnicode_DATA(self);
10379 buf2 = PyUnicode_DATA(substring);
10380 if (kind2 != kind1) {
10381 buf2 = _PyUnicode_AsKind(substring, kind1);
10382 if (!buf2)
10383 return NULL;
10384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010386 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010388 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10389 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 else
10392 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010393 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 break;
10395 case PyUnicode_2BYTE_KIND:
10396 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010397 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 break;
10399 case PyUnicode_4BYTE_KIND:
10400 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010401 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 break;
10403 default:
10404 out = NULL;
10405 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010406 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 PyMem_Free(buf2);
10408 return out;
10409}
10410
10411static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010412anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10413 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010415 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010417 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10418 return asciilib_find(buf1, len1, buf2, len2, offset);
10419 else
10420 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 case PyUnicode_2BYTE_KIND:
10422 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10423 case PyUnicode_4BYTE_KIND:
10424 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10425 }
10426 assert(0);
10427 return -1;
10428}
10429
10430static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010431anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10432 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010434 switch (kind) {
10435 case PyUnicode_1BYTE_KIND:
10436 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10437 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10438 else
10439 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10440 case PyUnicode_2BYTE_KIND:
10441 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10442 case PyUnicode_4BYTE_KIND:
10443 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10444 }
10445 assert(0);
10446 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010447}
10448
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010449static void
10450replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10451 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10452{
10453 int kind = PyUnicode_KIND(u);
10454 void *data = PyUnicode_DATA(u);
10455 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10456 if (kind == PyUnicode_1BYTE_KIND) {
10457 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10458 (Py_UCS1 *)data + len,
10459 u1, u2, maxcount);
10460 }
10461 else if (kind == PyUnicode_2BYTE_KIND) {
10462 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10463 (Py_UCS2 *)data + len,
10464 u1, u2, maxcount);
10465 }
10466 else {
10467 assert(kind == PyUnicode_4BYTE_KIND);
10468 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10469 (Py_UCS4 *)data + len,
10470 u1, u2, maxcount);
10471 }
10472}
10473
Alexander Belopolsky40018472011-02-26 01:02:56 +000010474static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475replace(PyObject *self, PyObject *str1,
10476 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 PyObject *u;
10479 char *sbuf = PyUnicode_DATA(self);
10480 char *buf1 = PyUnicode_DATA(str1);
10481 char *buf2 = PyUnicode_DATA(str2);
10482 int srelease = 0, release1 = 0, release2 = 0;
10483 int skind = PyUnicode_KIND(self);
10484 int kind1 = PyUnicode_KIND(str1);
10485 int kind2 = PyUnicode_KIND(str2);
10486 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10487 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10488 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010489 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010490 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491
10492 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010493 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010495 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
Victor Stinner59de0ee2011-10-07 10:01:28 +020010497 if (str1 == str2)
10498 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499
Victor Stinner49a0a212011-10-12 23:46:10 +020010500 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010501 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10502 if (maxchar < maxchar_str1)
10503 /* substring too wide to be present */
10504 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10506 /* Replacing str1 with str2 may cause a maxchar reduction in the
10507 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010509 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010512 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010514 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010516 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010518 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010519
Victor Stinner69ed0f42013-04-09 21:48:24 +020010520 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010521 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010522 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010524 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010528
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010529 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10530 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010531 }
10532 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 int rkind = skind;
10534 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010535 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (kind1 < rkind) {
10538 /* widen substring */
10539 buf1 = _PyUnicode_AsKind(str1, rkind);
10540 if (!buf1) goto error;
10541 release1 = 1;
10542 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010543 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544 if (i < 0)
10545 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (rkind > kind2) {
10547 /* widen replacement */
10548 buf2 = _PyUnicode_AsKind(str2, rkind);
10549 if (!buf2) goto error;
10550 release2 = 1;
10551 }
10552 else if (rkind < kind2) {
10553 /* widen self and buf1 */
10554 rkind = kind2;
10555 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010556 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 sbuf = _PyUnicode_AsKind(self, rkind);
10558 if (!sbuf) goto error;
10559 srelease = 1;
10560 buf1 = _PyUnicode_AsKind(str1, rkind);
10561 if (!buf1) goto error;
10562 release1 = 1;
10563 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010564 u = PyUnicode_New(slen, maxchar);
10565 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010567 assert(PyUnicode_KIND(u) == rkind);
10568 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010569
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010570 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010571 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010572 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010574 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010576
10577 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010578 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010580 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010581 if (i == -1)
10582 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010583 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010585 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010589 }
10590 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010592 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 int rkind = skind;
10594 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010597 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 buf1 = _PyUnicode_AsKind(str1, rkind);
10599 if (!buf1) goto error;
10600 release1 = 1;
10601 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010602 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010603 if (n == 0)
10604 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 buf2 = _PyUnicode_AsKind(str2, rkind);
10608 if (!buf2) goto error;
10609 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 rkind = kind2;
10614 sbuf = _PyUnicode_AsKind(self, rkind);
10615 if (!sbuf) goto error;
10616 srelease = 1;
10617 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010618 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 buf1 = _PyUnicode_AsKind(str1, rkind);
10620 if (!buf1) goto error;
10621 release1 = 1;
10622 }
10623 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10624 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010625 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 PyErr_SetString(PyExc_OverflowError,
10627 "replace string is too long");
10628 goto error;
10629 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010630 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010631 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010632 _Py_INCREF_UNICODE_EMPTY();
10633 if (!unicode_empty)
10634 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010635 u = unicode_empty;
10636 goto done;
10637 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010638 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 PyErr_SetString(PyExc_OverflowError,
10640 "replace string is too long");
10641 goto error;
10642 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 u = PyUnicode_New(new_size, maxchar);
10644 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 assert(PyUnicode_KIND(u) == rkind);
10647 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 ires = i = 0;
10649 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 while (n-- > 0) {
10651 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010652 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010654 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010655 if (j == -1)
10656 break;
10657 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 memcpy(res + rkind * ires,
10660 sbuf + rkind * i,
10661 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 }
10664 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010674 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 memcpy(res + rkind * ires,
10676 sbuf + rkind * i,
10677 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010678 }
10679 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 /* interleave */
10681 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010682 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010684 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 if (--n <= 0)
10687 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010688 memcpy(res + rkind * ires,
10689 sbuf + rkind * i,
10690 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 ires++;
10692 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010694 memcpy(res + rkind * ires,
10695 sbuf + rkind * i,
10696 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010698 }
10699
10700 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010701 unicode_adjust_maxchar(&u);
10702 if (u == NULL)
10703 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010705
10706 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 if (srelease)
10708 PyMem_FREE(sbuf);
10709 if (release1)
10710 PyMem_FREE(buf1);
10711 if (release2)
10712 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010713 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010715
Benjamin Peterson29060642009-01-31 22:14:21 +000010716 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010717 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (srelease)
10719 PyMem_FREE(sbuf);
10720 if (release1)
10721 PyMem_FREE(buf1);
10722 if (release2)
10723 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010724 return unicode_result_unchanged(self);
10725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 error:
10727 if (srelease && sbuf)
10728 PyMem_FREE(sbuf);
10729 if (release1 && buf1)
10730 PyMem_FREE(buf1);
10731 if (release2 && buf2)
10732 PyMem_FREE(buf2);
10733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734}
10735
10736/* --- Unicode Object Methods --------------------------------------------- */
10737
INADA Naoki3ae20562017-01-16 20:41:20 +090010738/*[clinic input]
10739str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
INADA Naoki3ae20562017-01-16 20:41:20 +090010741Return a version of the string where each word is titlecased.
10742
10743More specifically, words start with uppercased characters and all remaining
10744cased characters have lower case.
10745[clinic start generated code]*/
10746
10747static PyObject *
10748unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010749/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010751 if (PyUnicode_READY(self) == -1)
10752 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010753 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754}
10755
INADA Naoki3ae20562017-01-16 20:41:20 +090010756/*[clinic input]
10757str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758
INADA Naoki3ae20562017-01-16 20:41:20 +090010759Return a capitalized version of the string.
10760
10761More specifically, make the first character have upper case and the rest lower
10762case.
10763[clinic start generated code]*/
10764
10765static PyObject *
10766unicode_capitalize_impl(PyObject *self)
10767/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010769 if (PyUnicode_READY(self) == -1)
10770 return NULL;
10771 if (PyUnicode_GET_LENGTH(self) == 0)
10772 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010773 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774}
10775
INADA Naoki3ae20562017-01-16 20:41:20 +090010776/*[clinic input]
10777str.casefold as unicode_casefold
10778
10779Return a version of the string suitable for caseless comparisons.
10780[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010781
10782static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010783unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010784/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010785{
10786 if (PyUnicode_READY(self) == -1)
10787 return NULL;
10788 if (PyUnicode_IS_ASCII(self))
10789 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010790 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010791}
10792
10793
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010794/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010795
10796static int
10797convert_uc(PyObject *obj, void *addr)
10798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010800
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010801 if (!PyUnicode_Check(obj)) {
10802 PyErr_Format(PyExc_TypeError,
10803 "The fill character must be a unicode character, "
10804 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010805 return 0;
10806 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010807 if (PyUnicode_READY(obj) < 0)
10808 return 0;
10809 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010810 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010811 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010812 return 0;
10813 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010814 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010815 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010816}
10817
INADA Naoki3ae20562017-01-16 20:41:20 +090010818/*[clinic input]
10819str.center as unicode_center
10820
10821 width: Py_ssize_t
10822 fillchar: Py_UCS4 = ' '
10823 /
10824
10825Return a centered string of length width.
10826
10827Padding is done using the specified fill character (default is a space).
10828[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829
10830static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010831unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10832/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010834 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
Benjamin Petersonbac79492012-01-14 13:34:47 -050010836 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837 return NULL;
10838
Victor Stinnerc4b49542011-12-11 22:44:26 +010010839 if (PyUnicode_GET_LENGTH(self) >= width)
10840 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
Victor Stinnerc4b49542011-12-11 22:44:26 +010010842 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 left = marg / 2 + (marg & width & 1);
10844
Victor Stinner9310abb2011-10-05 00:59:23 +020010845 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846}
10847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848/* This function assumes that str1 and str2 are readied by the caller. */
10849
Marc-André Lemburge5034372000-08-08 08:04:29 +000010850static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010851unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010852{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010853#define COMPARE(TYPE1, TYPE2) \
10854 do { \
10855 TYPE1* p1 = (TYPE1 *)data1; \
10856 TYPE2* p2 = (TYPE2 *)data2; \
10857 TYPE1* end = p1 + len; \
10858 Py_UCS4 c1, c2; \
10859 for (; p1 != end; p1++, p2++) { \
10860 c1 = *p1; \
10861 c2 = *p2; \
10862 if (c1 != c2) \
10863 return (c1 < c2) ? -1 : 1; \
10864 } \
10865 } \
10866 while (0)
10867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 int kind1, kind2;
10869 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010870 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 kind1 = PyUnicode_KIND(str1);
10873 kind2 = PyUnicode_KIND(str2);
10874 data1 = PyUnicode_DATA(str1);
10875 data2 = PyUnicode_DATA(str2);
10876 len1 = PyUnicode_GET_LENGTH(str1);
10877 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010878 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010879
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010880 switch(kind1) {
10881 case PyUnicode_1BYTE_KIND:
10882 {
10883 switch(kind2) {
10884 case PyUnicode_1BYTE_KIND:
10885 {
10886 int cmp = memcmp(data1, data2, len);
10887 /* normalize result of memcmp() into the range [-1; 1] */
10888 if (cmp < 0)
10889 return -1;
10890 if (cmp > 0)
10891 return 1;
10892 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010893 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010894 case PyUnicode_2BYTE_KIND:
10895 COMPARE(Py_UCS1, Py_UCS2);
10896 break;
10897 case PyUnicode_4BYTE_KIND:
10898 COMPARE(Py_UCS1, Py_UCS4);
10899 break;
10900 default:
10901 assert(0);
10902 }
10903 break;
10904 }
10905 case PyUnicode_2BYTE_KIND:
10906 {
10907 switch(kind2) {
10908 case PyUnicode_1BYTE_KIND:
10909 COMPARE(Py_UCS2, Py_UCS1);
10910 break;
10911 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010912 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010913 COMPARE(Py_UCS2, Py_UCS2);
10914 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010915 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010916 case PyUnicode_4BYTE_KIND:
10917 COMPARE(Py_UCS2, Py_UCS4);
10918 break;
10919 default:
10920 assert(0);
10921 }
10922 break;
10923 }
10924 case PyUnicode_4BYTE_KIND:
10925 {
10926 switch(kind2) {
10927 case PyUnicode_1BYTE_KIND:
10928 COMPARE(Py_UCS4, Py_UCS1);
10929 break;
10930 case PyUnicode_2BYTE_KIND:
10931 COMPARE(Py_UCS4, Py_UCS2);
10932 break;
10933 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010934 {
10935#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10936 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10937 /* normalize result of wmemcmp() into the range [-1; 1] */
10938 if (cmp < 0)
10939 return -1;
10940 if (cmp > 0)
10941 return 1;
10942#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010943 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010944#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010945 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010946 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010947 default:
10948 assert(0);
10949 }
10950 break;
10951 }
10952 default:
10953 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010954 }
10955
Victor Stinner770e19e2012-10-04 22:59:45 +020010956 if (len1 == len2)
10957 return 0;
10958 if (len1 < len2)
10959 return -1;
10960 else
10961 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010962
10963#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010964}
10965
Benjamin Peterson621b4302016-09-09 13:54:34 -070010966static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010967unicode_compare_eq(PyObject *str1, PyObject *str2)
10968{
10969 int kind;
10970 void *data1, *data2;
10971 Py_ssize_t len;
10972 int cmp;
10973
Victor Stinnere5567ad2012-10-23 02:48:49 +020010974 len = PyUnicode_GET_LENGTH(str1);
10975 if (PyUnicode_GET_LENGTH(str2) != len)
10976 return 0;
10977 kind = PyUnicode_KIND(str1);
10978 if (PyUnicode_KIND(str2) != kind)
10979 return 0;
10980 data1 = PyUnicode_DATA(str1);
10981 data2 = PyUnicode_DATA(str2);
10982
10983 cmp = memcmp(data1, data2, len * kind);
10984 return (cmp == 0);
10985}
10986
10987
Alexander Belopolsky40018472011-02-26 01:02:56 +000010988int
10989PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10992 if (PyUnicode_READY(left) == -1 ||
10993 PyUnicode_READY(right) == -1)
10994 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010995
10996 /* a string is equal to itself */
10997 if (left == right)
10998 return 0;
10999
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011000 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011002 PyErr_Format(PyExc_TypeError,
11003 "Can't compare %.100s and %.100s",
11004 left->ob_type->tp_name,
11005 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 return -1;
11007}
11008
Martin v. Löwis5b222132007-06-10 09:51:05 +000011009int
11010PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 Py_ssize_t i;
11013 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011015 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016
Victor Stinner910337b2011-10-03 03:20:16 +020011017 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011018 if (!PyUnicode_IS_READY(uni)) {
11019 const wchar_t *ws = _PyUnicode_WSTR(uni);
11020 /* Compare Unicode string and source character set string */
11021 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11022 if (chr != ustr[i])
11023 return (chr < ustr[i]) ? -1 : 1;
11024 }
11025 /* This check keeps Python strings that end in '\0' from comparing equal
11026 to C strings identical up to that point. */
11027 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11028 return 1; /* uni is longer */
11029 if (ustr[i])
11030 return -1; /* str is longer */
11031 return 0;
11032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011034 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011035 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011036 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011037 size_t len, len2 = strlen(str);
11038 int cmp;
11039
11040 len = Py_MIN(len1, len2);
11041 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011042 if (cmp != 0) {
11043 if (cmp < 0)
11044 return -1;
11045 else
11046 return 1;
11047 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011048 if (len1 > len2)
11049 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011050 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011051 return -1; /* str is longer */
11052 return 0;
11053 }
11054 else {
11055 void *data = PyUnicode_DATA(uni);
11056 /* Compare Unicode string and source character set string */
11057 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011058 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011059 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11060 /* This check keeps Python strings that end in '\0' from comparing equal
11061 to C strings identical up to that point. */
11062 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11063 return 1; /* uni is longer */
11064 if (str[i])
11065 return -1; /* str is longer */
11066 return 0;
11067 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011068}
11069
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011070static int
11071non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11072{
11073 size_t i, len;
11074 const wchar_t *p;
11075 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11076 if (strlen(str) != len)
11077 return 0;
11078 p = _PyUnicode_WSTR(unicode);
11079 assert(p);
11080 for (i = 0; i < len; i++) {
11081 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011082 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011083 return 0;
11084 }
11085 return 1;
11086}
11087
11088int
11089_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11090{
11091 size_t len;
11092 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011093 assert(str);
11094#ifndef NDEBUG
11095 for (const char *p = str; *p; p++) {
11096 assert((unsigned char)*p < 128);
11097 }
11098#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011099 if (PyUnicode_READY(unicode) == -1) {
11100 /* Memory error or bad data */
11101 PyErr_Clear();
11102 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11103 }
11104 if (!PyUnicode_IS_ASCII(unicode))
11105 return 0;
11106 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11107 return strlen(str) == len &&
11108 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11109}
11110
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011111int
11112_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11113{
11114 PyObject *right_uni;
11115 Py_hash_t hash;
11116
11117 assert(_PyUnicode_CHECK(left));
11118 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011119#ifndef NDEBUG
11120 for (const char *p = right->string; *p; p++) {
11121 assert((unsigned char)*p < 128);
11122 }
11123#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011124
11125 if (PyUnicode_READY(left) == -1) {
11126 /* memory error or bad data */
11127 PyErr_Clear();
11128 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11129 }
11130
11131 if (!PyUnicode_IS_ASCII(left))
11132 return 0;
11133
11134 right_uni = _PyUnicode_FromId(right); /* borrowed */
11135 if (right_uni == NULL) {
11136 /* memory error or bad data */
11137 PyErr_Clear();
11138 return _PyUnicode_EqualToASCIIString(left, right->string);
11139 }
11140
11141 if (left == right_uni)
11142 return 1;
11143
11144 if (PyUnicode_CHECK_INTERNED(left))
11145 return 0;
11146
11147 assert(_PyUnicode_HASH(right_uni) != 1);
11148 hash = _PyUnicode_HASH(left);
11149 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11150 return 0;
11151
11152 return unicode_compare_eq(left, right_uni);
11153}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011154
/* Select the Python bool singleton matching a C truth value.  The macro
   itself does not touch reference counts. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011157
Alexander Belopolsky40018472011-02-26 01:02:56 +000011158PyObject *
11159PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011160{
11161 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011162 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011163
Victor Stinnere5567ad2012-10-23 02:48:49 +020011164 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11165 Py_RETURN_NOTIMPLEMENTED;
11166
11167 if (PyUnicode_READY(left) == -1 ||
11168 PyUnicode_READY(right) == -1)
11169 return NULL;
11170
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011171 if (left == right) {
11172 switch (op) {
11173 case Py_EQ:
11174 case Py_LE:
11175 case Py_GE:
11176 /* a string is equal to itself */
11177 v = Py_True;
11178 break;
11179 case Py_NE:
11180 case Py_LT:
11181 case Py_GT:
11182 v = Py_False;
11183 break;
11184 default:
11185 PyErr_BadArgument();
11186 return NULL;
11187 }
11188 }
11189 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011190 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011191 result ^= (op == Py_NE);
11192 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011193 }
11194 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011195 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011196
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011197 /* Convert the return value to a Boolean */
11198 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011199 case Py_LE:
11200 v = TEST_COND(result <= 0);
11201 break;
11202 case Py_GE:
11203 v = TEST_COND(result >= 0);
11204 break;
11205 case Py_LT:
11206 v = TEST_COND(result == -1);
11207 break;
11208 case Py_GT:
11209 v = TEST_COND(result == 1);
11210 break;
11211 default:
11212 PyErr_BadArgument();
11213 return NULL;
11214 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011215 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011216 Py_INCREF(v);
11217 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011218}
11219
Alexander Belopolsky40018472011-02-26 01:02:56 +000011220int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011221_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11222{
11223 return unicode_eq(aa, bb);
11224}
11225
11226int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011228{
Victor Stinner77282cb2013-04-14 19:22:47 +020011229 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 void *buf1, *buf2;
11231 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011232 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011233
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011234 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 "'in <string>' requires string as left operand, not %.100s",
11237 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011238 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011239 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011241 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011242 if (ensure_unicode(str) < 0)
11243 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011246 kind2 = PyUnicode_KIND(substr);
11247 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011248 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011250 len2 = PyUnicode_GET_LENGTH(substr);
11251 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011252 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011253 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011254 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011255 if (len2 == 1) {
11256 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11257 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011258 return result;
11259 }
11260 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011261 buf2 = _PyUnicode_AsKind(substr, kind1);
11262 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011263 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265
Victor Stinner77282cb2013-04-14 19:22:47 +020011266 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 case PyUnicode_1BYTE_KIND:
11268 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11269 break;
11270 case PyUnicode_2BYTE_KIND:
11271 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11272 break;
11273 case PyUnicode_4BYTE_KIND:
11274 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11275 break;
11276 default:
11277 result = -1;
11278 assert(0);
11279 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011280
Victor Stinner77282cb2013-04-14 19:22:47 +020011281 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 PyMem_Free(buf2);
11283
Guido van Rossum403d68b2000-03-13 15:55:09 +000011284 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011285}
11286
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287/* Concat to string or Unicode object giving a new Unicode object. */
11288
Alexander Belopolsky40018472011-02-26 01:02:56 +000011289PyObject *
11290PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011292 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011293 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011296 if (ensure_unicode(left) < 0)
11297 return NULL;
11298
11299 if (!PyUnicode_Check(right)) {
11300 PyErr_Format(PyExc_TypeError,
11301 "can only concatenate str (not \"%.200s\") to str",
11302 right->ob_type->tp_name);
11303 return NULL;
11304 }
11305 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307
11308 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011309 if (left == unicode_empty)
11310 return PyUnicode_FromObject(right);
11311 if (right == unicode_empty)
11312 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011314 left_len = PyUnicode_GET_LENGTH(left);
11315 right_len = PyUnicode_GET_LENGTH(right);
11316 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011317 PyErr_SetString(PyExc_OverflowError,
11318 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011319 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011320 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011321 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011322
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011323 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11324 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011325 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011328 result = PyUnicode_New(new_len, maxchar);
11329 if (result == NULL)
11330 return NULL;
11331 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11332 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11333 assert(_PyUnicode_CheckConsistency(result, 1));
11334 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335}
11336
Walter Dörwald1ab83302007-05-18 17:15:44 +000011337void
Victor Stinner23e56682011-10-03 03:54:37 +020011338PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011339{
Victor Stinner23e56682011-10-03 03:54:37 +020011340 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011341 Py_UCS4 maxchar, maxchar2;
11342 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011343
11344 if (p_left == NULL) {
11345 if (!PyErr_Occurred())
11346 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011347 return;
11348 }
Victor Stinner23e56682011-10-03 03:54:37 +020011349 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011350 if (right == NULL || left == NULL
11351 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011352 if (!PyErr_Occurred())
11353 PyErr_BadInternalCall();
11354 goto error;
11355 }
11356
Benjamin Petersonbac79492012-01-14 13:34:47 -050011357 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011358 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011359 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011360 goto error;
11361
Victor Stinner488fa492011-12-12 00:01:39 +010011362 /* Shortcuts */
11363 if (left == unicode_empty) {
11364 Py_DECREF(left);
11365 Py_INCREF(right);
11366 *p_left = right;
11367 return;
11368 }
11369 if (right == unicode_empty)
11370 return;
11371
11372 left_len = PyUnicode_GET_LENGTH(left);
11373 right_len = PyUnicode_GET_LENGTH(right);
11374 if (left_len > PY_SSIZE_T_MAX - right_len) {
11375 PyErr_SetString(PyExc_OverflowError,
11376 "strings are too large to concat");
11377 goto error;
11378 }
11379 new_len = left_len + right_len;
11380
11381 if (unicode_modifiable(left)
11382 && PyUnicode_CheckExact(right)
11383 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011384 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11385 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011386 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011387 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011388 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11389 {
11390 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011391 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011392 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011393
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011394 /* copy 'right' into the newly allocated area of 'left' */
11395 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011396 }
Victor Stinner488fa492011-12-12 00:01:39 +010011397 else {
11398 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11399 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011400 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011401
Victor Stinner488fa492011-12-12 00:01:39 +010011402 /* Concat the two Unicode strings */
11403 res = PyUnicode_New(new_len, maxchar);
11404 if (res == NULL)
11405 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011406 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11407 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011408 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011409 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011410 }
11411 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011412 return;
11413
11414error:
Victor Stinner488fa492011-12-12 00:01:39 +010011415 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011416}
11417
11418void
11419PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11420{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011421 PyUnicode_Append(pleft, right);
11422 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011423}
11424
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011425/*
11426Wraps stringlib_parse_args_finds() and additionally ensures that the
11427first argument is a unicode object.
11428*/
11429
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011430static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011431parse_args_finds_unicode(const char * function_name, PyObject *args,
11432 PyObject **substring,
11433 Py_ssize_t *start, Py_ssize_t *end)
11434{
11435 if(stringlib_parse_args_finds(function_name, args, substring,
11436 start, end)) {
11437 if (ensure_unicode(*substring) < 0)
11438 return 0;
11439 return 1;
11440 }
11441 return 0;
11442}
11443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011444PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011447Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011448string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011449interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450
11451static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011452unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011454 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011455 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011456 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011458 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 void *buf1, *buf2;
11460 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011462 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011463 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 kind1 = PyUnicode_KIND(self);
11466 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011467 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011468 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 len1 = PyUnicode_GET_LENGTH(self);
11471 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011473 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011474 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011475
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011476 buf1 = PyUnicode_DATA(self);
11477 buf2 = PyUnicode_DATA(substring);
11478 if (kind2 != kind1) {
11479 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011480 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011481 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011482 }
11483 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 case PyUnicode_1BYTE_KIND:
11485 iresult = ucs1lib_count(
11486 ((Py_UCS1*)buf1) + start, end - start,
11487 buf2, len2, PY_SSIZE_T_MAX
11488 );
11489 break;
11490 case PyUnicode_2BYTE_KIND:
11491 iresult = ucs2lib_count(
11492 ((Py_UCS2*)buf1) + start, end - start,
11493 buf2, len2, PY_SSIZE_T_MAX
11494 );
11495 break;
11496 case PyUnicode_4BYTE_KIND:
11497 iresult = ucs4lib_count(
11498 ((Py_UCS4*)buf1) + start, end - start,
11499 buf2, len2, PY_SSIZE_T_MAX
11500 );
11501 break;
11502 default:
11503 assert(0); iresult = 0;
11504 }
11505
11506 result = PyLong_FromSsize_t(iresult);
11507
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011508 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 return result;
11512}
11513
INADA Naoki3ae20562017-01-16 20:41:20 +090011514/*[clinic input]
11515str.encode as unicode_encode
11516
11517 encoding: str(c_default="NULL") = 'utf-8'
11518 The encoding in which to encode the string.
11519 errors: str(c_default="NULL") = 'strict'
11520 The error handling scheme to use for encoding errors.
11521 The default is 'strict' meaning that encoding errors raise a
11522 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11523 'xmlcharrefreplace' as well as any other name registered with
11524 codecs.register_error that can handle UnicodeEncodeErrors.
11525
11526Encode the string using the codec registered for encoding.
11527[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
11529static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011530unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011531/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011533 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011534}
11535
INADA Naoki3ae20562017-01-16 20:41:20 +090011536/*[clinic input]
11537str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538
INADA Naoki3ae20562017-01-16 20:41:20 +090011539 tabsize: int = 8
11540
11541Return a copy where all tab characters are expanded using spaces.
11542
11543If tabsize is not given, a tab size of 8 characters is assumed.
11544[clinic start generated code]*/
11545
11546static PyObject *
11547unicode_expandtabs_impl(PyObject *self, int tabsize)
11548/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011550 Py_ssize_t i, j, line_pos, src_len, incr;
11551 Py_UCS4 ch;
11552 PyObject *u;
11553 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011554 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011555 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556
Antoine Pitrou22425222011-10-04 19:10:51 +020011557 if (PyUnicode_READY(self) == -1)
11558 return NULL;
11559
Thomas Wouters7e474022000-07-16 12:04:32 +000011560 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 src_len = PyUnicode_GET_LENGTH(self);
11562 i = j = line_pos = 0;
11563 kind = PyUnicode_KIND(self);
11564 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011565 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011566 for (; i < src_len; i++) {
11567 ch = PyUnicode_READ(kind, src_data, i);
11568 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011569 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011571 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011573 goto overflow;
11574 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011576 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011580 goto overflow;
11581 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011583 if (ch == '\n' || ch == '\r')
11584 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011586 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011587 if (!found)
11588 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011589
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011591 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592 if (!u)
11593 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011594 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595
Antoine Pitroue71d5742011-10-04 15:55:09 +020011596 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597
Antoine Pitroue71d5742011-10-04 15:55:09 +020011598 for (; i < src_len; i++) {
11599 ch = PyUnicode_READ(kind, src_data, i);
11600 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011602 incr = tabsize - (line_pos % tabsize);
11603 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011604 FILL(kind, dest_data, ' ', j, incr);
11605 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011607 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011608 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011609 line_pos++;
11610 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011611 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011612 if (ch == '\n' || ch == '\r')
11613 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011615 }
11616 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011617 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011618
Antoine Pitroue71d5742011-10-04 15:55:09 +020011619 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011620 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622}
11623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011624PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011625 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626\n\
11627Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011628such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629arguments start and end are interpreted as in slice notation.\n\
11630\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011631Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632
11633static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011636 /* initialize variables to prevent gcc warning */
11637 PyObject *substring = NULL;
11638 Py_ssize_t start = 0;
11639 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011640 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011642 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011645 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011648 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 if (result == -2)
11651 return NULL;
11652
Christian Heimes217cfd12007-12-02 14:31:20 +000011653 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654}
11655
11656static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011657unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011659 void *data;
11660 enum PyUnicode_Kind kind;
11661 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011662
11663 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11664 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011666 }
11667 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11668 PyErr_SetString(PyExc_IndexError, "string index out of range");
11669 return NULL;
11670 }
11671 kind = PyUnicode_KIND(self);
11672 data = PyUnicode_DATA(self);
11673 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011674 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675}
11676
Guido van Rossumc2504932007-09-18 19:42:40 +000011677/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011678 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011679static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011680unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681{
Guido van Rossumc2504932007-09-18 19:42:40 +000011682 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011683 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011684
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011685#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011686 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011687#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 if (_PyUnicode_HASH(self) != -1)
11689 return _PyUnicode_HASH(self);
11690 if (PyUnicode_READY(self) == -1)
11691 return -1;
11692 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011693 /*
11694 We make the hash of the empty string be 0, rather than using
11695 (prefix ^ suffix), since this slightly obfuscates the hash secret
11696 */
11697 if (len == 0) {
11698 _PyUnicode_HASH(self) = 0;
11699 return 0;
11700 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011701 x = _Py_HashBytes(PyUnicode_DATA(self),
11702 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011704 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705}
11706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011707PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011710Return the lowest index in S where substring sub is found, \n\
11711such that sub is contained within S[start:end]. Optional\n\
11712arguments start and end are interpreted as in slice notation.\n\
11713\n\
11714Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715
11716static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011719 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011720 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011721 PyObject *substring = NULL;
11722 Py_ssize_t start = 0;
11723 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011725 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011728 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011731 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 if (result == -2)
11734 return NULL;
11735
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736 if (result < 0) {
11737 PyErr_SetString(PyExc_ValueError, "substring not found");
11738 return NULL;
11739 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011740
Christian Heimes217cfd12007-12-02 14:31:20 +000011741 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742}
11743
INADA Naoki3ae20562017-01-16 20:41:20 +090011744/*[clinic input]
11745str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
INADA Naoki3ae20562017-01-16 20:41:20 +090011747Return True if the string is a lowercase string, False otherwise.
11748
11749A string is lowercase if all cased characters in the string are lowercase and
11750there is at least one cased character in the string.
11751[clinic start generated code]*/
11752
11753static PyObject *
11754unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011755/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 Py_ssize_t i, length;
11758 int kind;
11759 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 int cased;
11761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 if (PyUnicode_READY(self) == -1)
11763 return NULL;
11764 length = PyUnicode_GET_LENGTH(self);
11765 kind = PyUnicode_KIND(self);
11766 data = PyUnicode_DATA(self);
11767
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (length == 1)
11770 return PyBool_FromLong(
11771 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011773 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011775 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011776
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 for (i = 0; i < length; i++) {
11779 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011780
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011782 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 else if (!cased && Py_UNICODE_ISLOWER(ch))
11784 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011786 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787}
11788
INADA Naoki3ae20562017-01-16 20:41:20 +090011789/*[clinic input]
11790str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791
INADA Naoki3ae20562017-01-16 20:41:20 +090011792Return True if the string is an uppercase string, False otherwise.
11793
11794A string is uppercase if all cased characters in the string are uppercase and
11795there is at least one cased character in the string.
11796[clinic start generated code]*/
11797
11798static PyObject *
11799unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011800/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 Py_ssize_t i, length;
11803 int kind;
11804 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805 int cased;
11806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 if (PyUnicode_READY(self) == -1)
11808 return NULL;
11809 length = PyUnicode_GET_LENGTH(self);
11810 kind = PyUnicode_KIND(self);
11811 data = PyUnicode_DATA(self);
11812
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 if (length == 1)
11815 return PyBool_FromLong(
11816 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011818 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011820 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011821
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 for (i = 0; i < length; i++) {
11824 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011825
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011827 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 else if (!cased && Py_UNICODE_ISUPPER(ch))
11829 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011831 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832}
11833
INADA Naoki3ae20562017-01-16 20:41:20 +090011834/*[clinic input]
11835str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836
INADA Naoki3ae20562017-01-16 20:41:20 +090011837Return True if the string is a title-cased string, False otherwise.
11838
11839In a title-cased string, upper- and title-case characters may only
11840follow uncased characters and lowercase characters only cased ones.
11841[clinic start generated code]*/
11842
11843static PyObject *
11844unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011845/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 Py_ssize_t i, length;
11848 int kind;
11849 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850 int cased, previous_is_cased;
11851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (PyUnicode_READY(self) == -1)
11853 return NULL;
11854 length = PyUnicode_GET_LENGTH(self);
11855 kind = PyUnicode_KIND(self);
11856 data = PyUnicode_DATA(self);
11857
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (length == 1) {
11860 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11861 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11862 (Py_UNICODE_ISUPPER(ch) != 0));
11863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011865 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011867 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011868
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869 cased = 0;
11870 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 for (i = 0; i < length; i++) {
11872 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011873
Benjamin Peterson29060642009-01-31 22:14:21 +000011874 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11875 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011876 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 previous_is_cased = 1;
11878 cased = 1;
11879 }
11880 else if (Py_UNICODE_ISLOWER(ch)) {
11881 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011882 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 previous_is_cased = 1;
11884 cased = 1;
11885 }
11886 else
11887 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011889 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890}
11891
INADA Naoki3ae20562017-01-16 20:41:20 +090011892/*[clinic input]
11893str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894
INADA Naoki3ae20562017-01-16 20:41:20 +090011895Return True if the string is a whitespace string, False otherwise.
11896
11897A string is whitespace if all characters in the string are whitespace and there
11898is at least one character in the string.
11899[clinic start generated code]*/
11900
11901static PyObject *
11902unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011903/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 Py_ssize_t i, length;
11906 int kind;
11907 void *data;
11908
11909 if (PyUnicode_READY(self) == -1)
11910 return NULL;
11911 length = PyUnicode_GET_LENGTH(self);
11912 kind = PyUnicode_KIND(self);
11913 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 if (length == 1)
11917 return PyBool_FromLong(
11918 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011920 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011922 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 for (i = 0; i < length; i++) {
11925 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011926 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011927 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011929 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930}
11931
INADA Naoki3ae20562017-01-16 20:41:20 +090011932/*[clinic input]
11933str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011934
INADA Naoki3ae20562017-01-16 20:41:20 +090011935Return True if the string is an alphabetic string, False otherwise.
11936
11937A string is alphabetic if all characters in the string are alphabetic and there
11938is at least one character in the string.
11939[clinic start generated code]*/
11940
11941static PyObject *
11942unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011943/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011944{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 Py_ssize_t i, length;
11946 int kind;
11947 void *data;
11948
11949 if (PyUnicode_READY(self) == -1)
11950 return NULL;
11951 length = PyUnicode_GET_LENGTH(self);
11952 kind = PyUnicode_KIND(self);
11953 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011954
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011955 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 if (length == 1)
11957 return PyBool_FromLong(
11958 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011959
11960 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011962 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 for (i = 0; i < length; i++) {
11965 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011966 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011967 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011968 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011969}
11970
INADA Naoki3ae20562017-01-16 20:41:20 +090011971/*[clinic input]
11972str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011973
INADA Naoki3ae20562017-01-16 20:41:20 +090011974Return True if the string is an alpha-numeric string, False otherwise.
11975
11976A string is alpha-numeric if all characters in the string are alpha-numeric and
11977there is at least one character in the string.
11978[clinic start generated code]*/
11979
11980static PyObject *
11981unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011982/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 int kind;
11985 void *data;
11986 Py_ssize_t len, i;
11987
11988 if (PyUnicode_READY(self) == -1)
11989 return NULL;
11990
11991 kind = PyUnicode_KIND(self);
11992 data = PyUnicode_DATA(self);
11993 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011994
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011995 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (len == 1) {
11997 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11998 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11999 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012000
12001 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012003 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 for (i = 0; i < len; i++) {
12006 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012007 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012008 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012009 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012010 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012011}
12012
INADA Naoki3ae20562017-01-16 20:41:20 +090012013/*[clinic input]
12014str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
INADA Naoki3ae20562017-01-16 20:41:20 +090012016Return True if the string is a decimal string, False otherwise.
12017
12018A string is a decimal string if all characters in the string are decimal and
12019there is at least one character in the string.
12020[clinic start generated code]*/
12021
12022static PyObject *
12023unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012024/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 Py_ssize_t i, length;
12027 int kind;
12028 void *data;
12029
12030 if (PyUnicode_READY(self) == -1)
12031 return NULL;
12032 length = PyUnicode_GET_LENGTH(self);
12033 kind = PyUnicode_KIND(self);
12034 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 if (length == 1)
12038 return PyBool_FromLong(
12039 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012041 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012043 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 for (i = 0; i < length; i++) {
12046 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012047 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012049 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050}
12051
INADA Naoki3ae20562017-01-16 20:41:20 +090012052/*[clinic input]
12053str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
INADA Naoki3ae20562017-01-16 20:41:20 +090012055Return True if the string is a digit string, False otherwise.
12056
12057A string is a digit string if all characters in the string are digits and there
12058is at least one character in the string.
12059[clinic start generated code]*/
12060
12061static PyObject *
12062unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012063/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 Py_ssize_t i, length;
12066 int kind;
12067 void *data;
12068
12069 if (PyUnicode_READY(self) == -1)
12070 return NULL;
12071 length = PyUnicode_GET_LENGTH(self);
12072 kind = PyUnicode_KIND(self);
12073 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (length == 1) {
12077 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12078 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012081 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012083 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 for (i = 0; i < length; i++) {
12086 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012087 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012089 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090}
12091
INADA Naoki3ae20562017-01-16 20:41:20 +090012092/*[clinic input]
12093str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094
INADA Naoki3ae20562017-01-16 20:41:20 +090012095Return True if the string is a numeric string, False otherwise.
12096
12097A string is numeric if all characters in the string are numeric and there is at
12098least one character in the string.
12099[clinic start generated code]*/
12100
12101static PyObject *
12102unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012103/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 Py_ssize_t i, length;
12106 int kind;
12107 void *data;
12108
12109 if (PyUnicode_READY(self) == -1)
12110 return NULL;
12111 length = PyUnicode_GET_LENGTH(self);
12112 kind = PyUnicode_KIND(self);
12113 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 if (length == 1)
12117 return PyBool_FromLong(
12118 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012120 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012122 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 for (i = 0; i < length; i++) {
12125 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012126 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012128 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129}
12130
Martin v. Löwis47383402007-08-15 07:32:56 +000012131int
12132PyUnicode_IsIdentifier(PyObject *self)
12133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 int kind;
12135 void *data;
12136 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012137 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 if (PyUnicode_READY(self) == -1) {
12140 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012141 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 }
12143
12144 /* Special case for empty strings */
12145 if (PyUnicode_GET_LENGTH(self) == 0)
12146 return 0;
12147 kind = PyUnicode_KIND(self);
12148 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012149
12150 /* PEP 3131 says that the first character must be in
12151 XID_Start and subsequent characters in XID_Continue,
12152 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012153 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012154 letters, digits, underscore). However, given the current
12155 definition of XID_Start and XID_Continue, it is sufficient
12156 to check just for these, except that _ must be allowed
12157 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012159 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012160 return 0;
12161
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012162 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012164 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012165 return 1;
12166}
12167
INADA Naoki3ae20562017-01-16 20:41:20 +090012168/*[clinic input]
12169str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012170
INADA Naoki3ae20562017-01-16 20:41:20 +090012171Return True if the string is a valid Python identifier, False otherwise.
12172
12173Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12174"class".
12175[clinic start generated code]*/
12176
12177static PyObject *
12178unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012179/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012180{
12181 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12182}
12183
INADA Naoki3ae20562017-01-16 20:41:20 +090012184/*[clinic input]
12185str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012186
INADA Naoki3ae20562017-01-16 20:41:20 +090012187Return True if the string is printable, False otherwise.
12188
12189A string is printable if all of its characters are considered printable in
12190repr() or if it is empty.
12191[clinic start generated code]*/
12192
12193static PyObject *
12194unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012195/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012196{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 Py_ssize_t i, length;
12198 int kind;
12199 void *data;
12200
12201 if (PyUnicode_READY(self) == -1)
12202 return NULL;
12203 length = PyUnicode_GET_LENGTH(self);
12204 kind = PyUnicode_KIND(self);
12205 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012206
12207 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 if (length == 1)
12209 return PyBool_FromLong(
12210 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 for (i = 0; i < length; i++) {
12213 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012214 Py_RETURN_FALSE;
12215 }
12216 }
12217 Py_RETURN_TRUE;
12218}
12219
INADA Naoki3ae20562017-01-16 20:41:20 +090012220/*[clinic input]
12221str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222
INADA Naoki3ae20562017-01-16 20:41:20 +090012223 iterable: object
12224 /
12225
12226Concatenate any number of strings.
12227
Martin Panter91a88662017-01-24 00:30:06 +000012228The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012229The result is returned as a new string.
12230
12231Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12232[clinic start generated code]*/
12233
12234static PyObject *
12235unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012236/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237{
INADA Naoki3ae20562017-01-16 20:41:20 +090012238 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239}
12240
Martin v. Löwis18e16552006-02-15 17:27:45 +000012241static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012242unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 if (PyUnicode_READY(self) == -1)
12245 return -1;
12246 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247}
12248
INADA Naoki3ae20562017-01-16 20:41:20 +090012249/*[clinic input]
12250str.ljust as unicode_ljust
12251
12252 width: Py_ssize_t
12253 fillchar: Py_UCS4 = ' '
12254 /
12255
12256Return a left-justified string of length width.
12257
12258Padding is done using the specified fill character (default is a space).
12259[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260
12261static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012262unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12263/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012265 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
Victor Stinnerc4b49542011-12-11 22:44:26 +010012268 if (PyUnicode_GET_LENGTH(self) >= width)
12269 return unicode_result_unchanged(self);
12270
12271 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272}
12273
INADA Naoki3ae20562017-01-16 20:41:20 +090012274/*[clinic input]
12275str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276
INADA Naoki3ae20562017-01-16 20:41:20 +090012277Return a copy of the string converted to lowercase.
12278[clinic start generated code]*/
12279
12280static PyObject *
12281unicode_lower_impl(PyObject *self)
12282/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012284 if (PyUnicode_READY(self) == -1)
12285 return NULL;
12286 if (PyUnicode_IS_ASCII(self))
12287 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012288 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289}
12290
/* Strip modes.  Each value is also an index into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the strip-mode constants above.  Both the
   strings and the table entries are const, so the table cannot be
   modified after initialization. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the method that corresponds to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012299
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012300/* externally visible for str.strip(unicode) */
12301PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012302_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 void *data;
12305 int kind;
12306 Py_ssize_t i, j, len;
12307 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012308 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12311 return NULL;
12312
12313 kind = PyUnicode_KIND(self);
12314 data = PyUnicode_DATA(self);
12315 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012316 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12318 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012319 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012320
Benjamin Peterson14339b62009-01-31 16:36:08 +000012321 i = 0;
12322 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012323 while (i < len) {
12324 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12325 if (!BLOOM(sepmask, ch))
12326 break;
12327 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12328 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 i++;
12330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012331 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012332
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 j = len;
12334 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012335 j--;
12336 while (j >= i) {
12337 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12338 if (!BLOOM(sepmask, ch))
12339 break;
12340 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12341 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012342 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012343 }
12344
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012346 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012347
Victor Stinner7931d9a2011-11-04 00:22:48 +010012348 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349}
12350
12351PyObject*
12352PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12353{
12354 unsigned char *data;
12355 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012356 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357
Victor Stinnerde636f32011-10-01 03:55:54 +020012358 if (PyUnicode_READY(self) == -1)
12359 return NULL;
12360
Victor Stinner684d5fd2012-05-03 02:32:34 +020012361 length = PyUnicode_GET_LENGTH(self);
12362 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012363
Victor Stinner684d5fd2012-05-03 02:32:34 +020012364 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012365 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366
Victor Stinnerde636f32011-10-01 03:55:54 +020012367 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012368 PyErr_SetString(PyExc_IndexError, "string index out of range");
12369 return NULL;
12370 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012371 if (start >= length || end < start)
12372 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012373
Victor Stinner684d5fd2012-05-03 02:32:34 +020012374 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012375 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012376 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012377 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012378 }
12379 else {
12380 kind = PyUnicode_KIND(self);
12381 data = PyUnicode_1BYTE_DATA(self);
12382 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012383 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012384 length);
12385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387
12388static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012389do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 Py_ssize_t len, i, j;
12392
12393 if (PyUnicode_READY(self) == -1)
12394 return NULL;
12395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012397
Victor Stinnercc7af722013-04-09 22:39:24 +020012398 if (PyUnicode_IS_ASCII(self)) {
12399 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12400
12401 i = 0;
12402 if (striptype != RIGHTSTRIP) {
12403 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012404 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012405 if (!_Py_ascii_whitespace[ch])
12406 break;
12407 i++;
12408 }
12409 }
12410
12411 j = len;
12412 if (striptype != LEFTSTRIP) {
12413 j--;
12414 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012415 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012416 if (!_Py_ascii_whitespace[ch])
12417 break;
12418 j--;
12419 }
12420 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012421 }
12422 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012423 else {
12424 int kind = PyUnicode_KIND(self);
12425 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012426
Victor Stinnercc7af722013-04-09 22:39:24 +020012427 i = 0;
12428 if (striptype != RIGHTSTRIP) {
12429 while (i < len) {
12430 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12431 if (!Py_UNICODE_ISSPACE(ch))
12432 break;
12433 i++;
12434 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012435 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012436
12437 j = len;
12438 if (striptype != LEFTSTRIP) {
12439 j--;
12440 while (j >= i) {
12441 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12442 if (!Py_UNICODE_ISSPACE(ch))
12443 break;
12444 j--;
12445 }
12446 j++;
12447 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012448 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012449
Victor Stinner7931d9a2011-11-04 00:22:48 +010012450 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451}
12452
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012453
12454static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012455do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012456{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012457 if (sep != NULL && sep != Py_None) {
12458 if (PyUnicode_Check(sep))
12459 return _PyUnicode_XStrip(self, striptype, sep);
12460 else {
12461 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012462 "%s arg must be None or str",
12463 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 return NULL;
12465 }
12466 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012467
Benjamin Peterson14339b62009-01-31 16:36:08 +000012468 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012469}
12470
12471
INADA Naoki3ae20562017-01-16 20:41:20 +090012472/*[clinic input]
12473str.strip as unicode_strip
12474
12475 chars: object = None
12476 /
12477
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012479
12480If chars is given and not None, remove characters in chars instead.
12481[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012482
12483static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012484unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012485/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012486{
INADA Naoki3ae20562017-01-16 20:41:20 +090012487 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012488}
12489
12490
INADA Naoki3ae20562017-01-16 20:41:20 +090012491/*[clinic input]
12492str.lstrip as unicode_lstrip
12493
12494 chars: object = NULL
12495 /
12496
12497Return a copy of the string with leading whitespace removed.
12498
12499If chars is given and not None, remove characters in chars instead.
12500[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012501
12502static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012503unicode_lstrip_impl(PyObject *self, PyObject *chars)
12504/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012505{
INADA Naoki3ae20562017-01-16 20:41:20 +090012506 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012507}
12508
12509
INADA Naoki3ae20562017-01-16 20:41:20 +090012510/*[clinic input]
12511str.rstrip as unicode_rstrip
12512
12513 chars: object = NULL
12514 /
12515
12516Return a copy of the string with trailing whitespace removed.
12517
12518If chars is given and not None, remove characters in chars instead.
12519[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012520
12521static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012522unicode_rstrip_impl(PyObject *self, PyObject *chars)
12523/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012524{
INADA Naoki3ae20562017-01-16 20:41:20 +090012525 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012526}
12527
12528
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012530unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012532 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534
Serhiy Storchaka05997252013-01-26 12:14:02 +020012535 if (len < 1)
12536 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
Victor Stinnerc4b49542011-12-11 22:44:26 +010012538 /* no repeat, return original string */
12539 if (len == 1)
12540 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012541
Benjamin Petersonbac79492012-01-14 13:34:47 -050012542 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 return NULL;
12544
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012545 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012546 PyErr_SetString(PyExc_OverflowError,
12547 "repeated string is too long");
12548 return NULL;
12549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012551
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012552 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553 if (!u)
12554 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012555 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 if (PyUnicode_GET_LENGTH(str) == 1) {
12558 const int kind = PyUnicode_KIND(str);
12559 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012560 if (kind == PyUnicode_1BYTE_KIND) {
12561 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012562 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012563 }
12564 else if (kind == PyUnicode_2BYTE_KIND) {
12565 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012566 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012567 ucs2[n] = fill_char;
12568 } else {
12569 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12570 assert(kind == PyUnicode_4BYTE_KIND);
12571 for (n = 0; n < len; ++n)
12572 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 }
12575 else {
12576 /* number of characters copied this far */
12577 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012578 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012580 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012584 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012585 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 }
12588
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012589 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012590 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591}
12592
Alexander Belopolsky40018472011-02-26 01:02:56 +000012593PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012594PyUnicode_Replace(PyObject *str,
12595 PyObject *substr,
12596 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012597 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012599 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12600 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012602 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603}
12604
INADA Naoki3ae20562017-01-16 20:41:20 +090012605/*[clinic input]
12606str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607
INADA Naoki3ae20562017-01-16 20:41:20 +090012608 old: unicode
12609 new: unicode
12610 count: Py_ssize_t = -1
12611 Maximum number of occurrences to replace.
12612 -1 (the default value) means replace all occurrences.
12613 /
12614
12615Return a copy with all occurrences of substring old replaced by new.
12616
12617If the optional argument count is given, only the first count occurrences are
12618replaced.
12619[clinic start generated code]*/
12620
12621static PyObject *
12622unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12623 Py_ssize_t count)
12624/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012626 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012627 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012628 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629}
12630
Alexander Belopolsky40018472011-02-26 01:02:56 +000012631static PyObject *
12632unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012634 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 Py_ssize_t isize;
12636 Py_ssize_t osize, squote, dquote, i, o;
12637 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012638 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012642 return NULL;
12643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 isize = PyUnicode_GET_LENGTH(unicode);
12645 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 /* Compute length of output, quote characters, and
12648 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012649 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 max = 127;
12651 squote = dquote = 0;
12652 ikind = PyUnicode_KIND(unicode);
12653 for (i = 0; i < isize; i++) {
12654 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012655 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012657 case '\'': squote++; break;
12658 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012660 incr = 2;
12661 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 default:
12663 /* Fast-path ASCII */
12664 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012665 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012667 ;
12668 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012671 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012673 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012675 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012677 if (osize > PY_SSIZE_T_MAX - incr) {
12678 PyErr_SetString(PyExc_OverflowError,
12679 "string is too long to generate repr");
12680 return NULL;
12681 }
12682 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 }
12684
12685 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012686 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012688 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 if (dquote)
12690 /* Both squote and dquote present. Use squote,
12691 and escape them */
12692 osize += squote;
12693 else
12694 quote = '"';
12695 }
Victor Stinner55c08782013-04-14 18:45:39 +020012696 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697
12698 repr = PyUnicode_New(osize, max);
12699 if (repr == NULL)
12700 return NULL;
12701 okind = PyUnicode_KIND(repr);
12702 odata = PyUnicode_DATA(repr);
12703
12704 PyUnicode_WRITE(okind, odata, 0, quote);
12705 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012706 if (unchanged) {
12707 _PyUnicode_FastCopyCharacters(repr, 1,
12708 unicode, 0,
12709 isize);
12710 }
12711 else {
12712 for (i = 0, o = 1; i < isize; i++) {
12713 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714
Victor Stinner55c08782013-04-14 18:45:39 +020012715 /* Escape quotes and backslashes */
12716 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012717 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012719 continue;
12720 }
12721
12722 /* Map special whitespace to '\t', \n', '\r' */
12723 if (ch == '\t') {
12724 PyUnicode_WRITE(okind, odata, o++, '\\');
12725 PyUnicode_WRITE(okind, odata, o++, 't');
12726 }
12727 else if (ch == '\n') {
12728 PyUnicode_WRITE(okind, odata, o++, '\\');
12729 PyUnicode_WRITE(okind, odata, o++, 'n');
12730 }
12731 else if (ch == '\r') {
12732 PyUnicode_WRITE(okind, odata, o++, '\\');
12733 PyUnicode_WRITE(okind, odata, o++, 'r');
12734 }
12735
12736 /* Map non-printable US ASCII to '\xhh' */
12737 else if (ch < ' ' || ch == 0x7F) {
12738 PyUnicode_WRITE(okind, odata, o++, '\\');
12739 PyUnicode_WRITE(okind, odata, o++, 'x');
12740 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12741 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12742 }
12743
12744 /* Copy ASCII characters as-is */
12745 else if (ch < 0x7F) {
12746 PyUnicode_WRITE(okind, odata, o++, ch);
12747 }
12748
12749 /* Non-ASCII characters */
12750 else {
12751 /* Map Unicode whitespace and control characters
12752 (categories Z* and C* except ASCII space)
12753 */
12754 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12755 PyUnicode_WRITE(okind, odata, o++, '\\');
12756 /* Map 8-bit characters to '\xhh' */
12757 if (ch <= 0xff) {
12758 PyUnicode_WRITE(okind, odata, o++, 'x');
12759 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12761 }
12762 /* Map 16-bit characters to '\uxxxx' */
12763 else if (ch <= 0xffff) {
12764 PyUnicode_WRITE(okind, odata, o++, 'u');
12765 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12769 }
12770 /* Map 21-bit characters to '\U00xxxxxx' */
12771 else {
12772 PyUnicode_WRITE(okind, odata, o++, 'U');
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12776 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12777 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12778 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12779 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12781 }
12782 }
12783 /* Copy characters as-is */
12784 else {
12785 PyUnicode_WRITE(okind, odata, o++, ch);
12786 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012787 }
12788 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012791 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012792 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793}
12794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012795PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012796 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797\n\
12798Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012799such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800arguments start and end are interpreted as in slice notation.\n\
12801\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012802Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803
12804static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012805unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012807 /* initialize variables to prevent gcc warning */
12808 PyObject *substring = NULL;
12809 Py_ssize_t start = 0;
12810 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012811 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012813 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012814 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012816 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012817 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012819 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 if (result == -2)
12822 return NULL;
12823
Christian Heimes217cfd12007-12-02 14:31:20 +000012824 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825}
12826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012827PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012830Return the highest index in S where substring sub is found,\n\
12831such that sub is contained within S[start:end]. Optional\n\
12832arguments start and end are interpreted as in slice notation.\n\
12833\n\
12834Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835
12836static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012839 /* initialize variables to prevent gcc warning */
12840 PyObject *substring = NULL;
12841 Py_ssize_t start = 0;
12842 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012843 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012845 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012846 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012848 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012851 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853 if (result == -2)
12854 return NULL;
12855
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856 if (result < 0) {
12857 PyErr_SetString(PyExc_ValueError, "substring not found");
12858 return NULL;
12859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860
Christian Heimes217cfd12007-12-02 14:31:20 +000012861 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862}
12863
INADA Naoki3ae20562017-01-16 20:41:20 +090012864/*[clinic input]
12865str.rjust as unicode_rjust
12866
12867 width: Py_ssize_t
12868 fillchar: Py_UCS4 = ' '
12869 /
12870
12871Return a right-justified string of length width.
12872
12873Padding is done using the specified fill character (default is a space).
12874[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875
12876static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012877unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12878/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012880 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881 return NULL;
12882
Victor Stinnerc4b49542011-12-11 22:44:26 +010012883 if (PyUnicode_GET_LENGTH(self) >= width)
12884 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885
Victor Stinnerc4b49542011-12-11 22:44:26 +010012886 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887}
12888
Alexander Belopolsky40018472011-02-26 01:02:56 +000012889PyObject *
12890PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012892 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012895 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896}
12897
INADA Naoki3ae20562017-01-16 20:41:20 +090012898/*[clinic input]
12899str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900
INADA Naoki3ae20562017-01-16 20:41:20 +090012901 sep: object = None
        The delimiter according to which to split the string.
12903 None (the default value) means split according to any whitespace,
12904 and discard empty strings from the result.
12905 maxsplit: Py_ssize_t = -1
12906 Maximum number of splits to do.
12907 -1 (the default value) means no limit.
12908
12909Return a list of the words in the string, using sep as the delimiter string.
12910[clinic start generated code]*/
12911
12912static PyObject *
12913unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12914/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915{
INADA Naoki3ae20562017-01-16 20:41:20 +090012916 if (sep == Py_None)
12917 return split(self, NULL, maxsplit);
12918 if (PyUnicode_Check(sep))
12919 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012920
12921 PyErr_Format(PyExc_TypeError,
12922 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012923 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925}
12926
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012928PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012929{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012931 int kind1, kind2;
12932 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012934
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012935 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012937
Victor Stinner14f8f022011-10-05 20:58:25 +020012938 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 len1 = PyUnicode_GET_LENGTH(str_obj);
12941 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012942 if (kind1 < kind2 || len1 < len2) {
12943 _Py_INCREF_UNICODE_EMPTY();
12944 if (!unicode_empty)
12945 out = NULL;
12946 else {
12947 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12948 Py_DECREF(unicode_empty);
12949 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012950 return out;
12951 }
12952 buf1 = PyUnicode_DATA(str_obj);
12953 buf2 = PyUnicode_DATA(sep_obj);
12954 if (kind2 != kind1) {
12955 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12956 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012957 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012960 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012962 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12963 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12964 else
12965 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 break;
12967 case PyUnicode_2BYTE_KIND:
12968 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12969 break;
12970 case PyUnicode_4BYTE_KIND:
12971 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12972 break;
12973 default:
12974 assert(0);
12975 out = 0;
12976 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012977
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012978 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012980
12981 return out;
12982}
12983
12984
12985PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012986PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012988 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012989 int kind1, kind2;
12990 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012993 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012994 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012996 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 len1 = PyUnicode_GET_LENGTH(str_obj);
12999 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013000 if (kind1 < kind2 || len1 < len2) {
13001 _Py_INCREF_UNICODE_EMPTY();
13002 if (!unicode_empty)
13003 out = NULL;
13004 else {
13005 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13006 Py_DECREF(unicode_empty);
13007 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 return out;
13009 }
13010 buf1 = PyUnicode_DATA(str_obj);
13011 buf2 = PyUnicode_DATA(sep_obj);
13012 if (kind2 != kind1) {
13013 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13014 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013015 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013018 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013020 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13021 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022 else
13023 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 break;
13025 case PyUnicode_2BYTE_KIND:
13026 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13027 break;
13028 case PyUnicode_4BYTE_KIND:
13029 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13030 break;
13031 default:
13032 assert(0);
13033 out = 0;
13034 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013035
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013036 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013038
13039 return out;
13040}
13041
INADA Naoki3ae20562017-01-16 20:41:20 +090013042/*[clinic input]
13043str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013044
INADA Naoki3ae20562017-01-16 20:41:20 +090013045 sep: object
13046 /
13047
13048Partition the string into three parts using the given separator.
13049
13050This will search for the separator in the string. If the separator is found,
13051returns a 3-tuple containing the part before the separator, the separator
13052itself, and the part after it.
13053
13054If the separator is not found, returns a 3-tuple containing the original string
13055and two empty strings.
13056[clinic start generated code]*/
13057
13058static PyObject *
13059unicode_partition(PyObject *self, PyObject *sep)
13060/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013061{
INADA Naoki3ae20562017-01-16 20:41:20 +090013062 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013063}
13064
INADA Naoki3ae20562017-01-16 20:41:20 +090013065/*[clinic input]
13066str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013067
INADA Naoki3ae20562017-01-16 20:41:20 +090013068Partition the string into three parts using the given separator.
13069
This will search for the separator in the string, starting at the end. If
13071the separator is found, returns a 3-tuple containing the part before the
13072separator, the separator itself, and the part after it.
13073
13074If the separator is not found, returns a 3-tuple containing two empty strings
13075and the original string.
13076[clinic start generated code]*/
13077
13078static PyObject *
13079unicode_rpartition(PyObject *self, PyObject *sep)
13080/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013081{
INADA Naoki3ae20562017-01-16 20:41:20 +090013082 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013083}
13084
Alexander Belopolsky40018472011-02-26 01:02:56 +000013085PyObject *
13086PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013087{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013088 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013089 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013090
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013091 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013092}
13093
INADA Naoki3ae20562017-01-16 20:41:20 +090013094/*[clinic input]
13095str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013096
INADA Naoki3ae20562017-01-16 20:41:20 +090013097Return a list of the words in the string, using sep as the delimiter string.
13098
13099Splits are done starting at the end of the string and working to the front.
13100[clinic start generated code]*/
13101
13102static PyObject *
13103unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13104/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013105{
INADA Naoki3ae20562017-01-16 20:41:20 +090013106 if (sep == Py_None)
13107 return rsplit(self, NULL, maxsplit);
13108 if (PyUnicode_Check(sep))
13109 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013110
13111 PyErr_Format(PyExc_TypeError,
13112 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013113 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013114 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013115}
13116
INADA Naoki3ae20562017-01-16 20:41:20 +090013117/*[clinic input]
13118str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013120 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013121
13122Return a list of the lines in the string, breaking at line boundaries.
13123
13124Line breaks are not included in the resulting list unless keepends is given and
13125true.
13126[clinic start generated code]*/
13127
13128static PyObject *
13129unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013130/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013132 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133}
13134
13135static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013136PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013138 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139}
13140
INADA Naoki3ae20562017-01-16 20:41:20 +090013141/*[clinic input]
13142str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143
INADA Naoki3ae20562017-01-16 20:41:20 +090013144Convert uppercase characters to lowercase and lowercase characters to uppercase.
13145[clinic start generated code]*/
13146
13147static PyObject *
13148unicode_swapcase_impl(PyObject *self)
13149/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013151 if (PyUnicode_READY(self) == -1)
13152 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013153 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154}
13155
Larry Hastings61272b72014-01-07 12:41:53 -080013156/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013157
Larry Hastings31826802013-10-19 00:09:25 -070013158@staticmethod
13159str.maketrans as unicode_maketrans
13160
13161 x: object
13162
13163 y: unicode=NULL
13164
13165 z: unicode=NULL
13166
13167 /
13168
13169Return a translation table usable for str.translate().
13170
13171If there is only one argument, it must be a dictionary mapping Unicode
13172ordinals (integers) or characters to Unicode ordinals, strings or None.
13173Character keys will be then converted to ordinals.
13174If there are two arguments, they must be strings of equal length, and
13175in the resulting dictionary, each character in x will be mapped to the
13176character at the same position in y. If there is a third argument, it
13177must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013178[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013179
Larry Hastings31826802013-10-19 00:09:25 -070013180static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013181unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013182/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013183{
Georg Brandlceee0772007-11-27 23:48:05 +000013184 PyObject *new = NULL, *key, *value;
13185 Py_ssize_t i = 0;
13186 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013187
Georg Brandlceee0772007-11-27 23:48:05 +000013188 new = PyDict_New();
13189 if (!new)
13190 return NULL;
13191 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 int x_kind, y_kind, z_kind;
13193 void *x_data, *y_data, *z_data;
13194
Georg Brandlceee0772007-11-27 23:48:05 +000013195 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013196 if (!PyUnicode_Check(x)) {
13197 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13198 "be a string if there is a second argument");
13199 goto err;
13200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013202 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13203 "arguments must have equal length");
13204 goto err;
13205 }
13206 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 x_kind = PyUnicode_KIND(x);
13208 y_kind = PyUnicode_KIND(y);
13209 x_data = PyUnicode_DATA(x);
13210 y_data = PyUnicode_DATA(y);
13211 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13212 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013213 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013214 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013215 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013216 if (!value) {
13217 Py_DECREF(key);
13218 goto err;
13219 }
Georg Brandlceee0772007-11-27 23:48:05 +000013220 res = PyDict_SetItem(new, key, value);
13221 Py_DECREF(key);
13222 Py_DECREF(value);
13223 if (res < 0)
13224 goto err;
13225 }
13226 /* create entries for deleting chars in z */
13227 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228 z_kind = PyUnicode_KIND(z);
13229 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013230 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013232 if (!key)
13233 goto err;
13234 res = PyDict_SetItem(new, key, Py_None);
13235 Py_DECREF(key);
13236 if (res < 0)
13237 goto err;
13238 }
13239 }
13240 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013241 int kind;
13242 void *data;
13243
Georg Brandlceee0772007-11-27 23:48:05 +000013244 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013245 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013246 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13247 "to maketrans it must be a dict");
13248 goto err;
13249 }
13250 /* copy entries into the new dict, converting string keys to int keys */
13251 while (PyDict_Next(x, &i, &key, &value)) {
13252 if (PyUnicode_Check(key)) {
13253 /* convert string keys to integer keys */
13254 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013255 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013256 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13257 "table must be of length 1");
13258 goto err;
13259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 kind = PyUnicode_KIND(key);
13261 data = PyUnicode_DATA(key);
13262 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013263 if (!newkey)
13264 goto err;
13265 res = PyDict_SetItem(new, newkey, value);
13266 Py_DECREF(newkey);
13267 if (res < 0)
13268 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013269 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013270 /* just keep integer keys */
13271 if (PyDict_SetItem(new, key, value) < 0)
13272 goto err;
13273 } else {
13274 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13275 "be strings or integers");
13276 goto err;
13277 }
13278 }
13279 }
13280 return new;
13281 err:
13282 Py_DECREF(new);
13283 return NULL;
13284}
13285
INADA Naoki3ae20562017-01-16 20:41:20 +090013286/*[clinic input]
13287str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013288
INADA Naoki3ae20562017-01-16 20:41:20 +090013289 table: object
13290 Translation table, which must be a mapping of Unicode ordinals to
13291 Unicode ordinals, strings, or None.
13292 /
13293
13294Replace each character in the string using the given translation table.
13295
13296The table must implement lookup/indexing via __getitem__, for instance a
13297dictionary or list. If this operation raises LookupError, the character is
13298left untouched. Characters mapped to None are deleted.
13299[clinic start generated code]*/
13300
13301static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013303/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013306}
13307
INADA Naoki3ae20562017-01-16 20:41:20 +090013308/*[clinic input]
13309str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310
INADA Naoki3ae20562017-01-16 20:41:20 +090013311Return a copy of the string converted to uppercase.
13312[clinic start generated code]*/
13313
13314static PyObject *
13315unicode_upper_impl(PyObject *self)
13316/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013318 if (PyUnicode_READY(self) == -1)
13319 return NULL;
13320 if (PyUnicode_IS_ASCII(self))
13321 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013322 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323}
13324
INADA Naoki3ae20562017-01-16 20:41:20 +090013325/*[clinic input]
13326str.zfill as unicode_zfill
13327
13328 width: Py_ssize_t
13329 /
13330
13331Pad a numeric string with zeros on the left, to fill a field of the given width.
13332
13333The string is never truncated.
13334[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335
13336static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013337unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013338/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013340 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013341 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013342 int kind;
13343 void *data;
13344 Py_UCS4 chr;
13345
Benjamin Petersonbac79492012-01-14 13:34:47 -050013346 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013347 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013348
Victor Stinnerc4b49542011-12-11 22:44:26 +010013349 if (PyUnicode_GET_LENGTH(self) >= width)
13350 return unicode_result_unchanged(self);
13351
13352 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353
13354 u = pad(self, fill, 0, '0');
13355
Walter Dörwald068325e2002-04-15 13:36:47 +000013356 if (u == NULL)
13357 return NULL;
13358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013359 kind = PyUnicode_KIND(u);
13360 data = PyUnicode_DATA(u);
13361 chr = PyUnicode_READ(kind, data, fill);
13362
13363 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365 PyUnicode_WRITE(kind, data, 0, chr);
13366 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367 }
13368
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013369 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013370 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
#if 0
/* Compiled out.  Wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013381PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013384Return True if S starts with the specified prefix, False otherwise.\n\
13385With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013386With optional end, stop comparing S at that position.\n\
13387prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388
13389static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013390unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013392{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013393 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013394 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013395 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013396 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013397 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398
Jesus Ceaac451502011-04-20 17:09:23 +020013399 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 if (PyTuple_Check(subobj)) {
13402 Py_ssize_t i;
13403 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013404 substring = PyTuple_GET_ITEM(subobj, i);
13405 if (!PyUnicode_Check(substring)) {
13406 PyErr_Format(PyExc_TypeError,
13407 "tuple for startswith must only contain str, "
13408 "not %.100s",
13409 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013410 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013411 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013412 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013413 if (result == -1)
13414 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013415 if (result) {
13416 Py_RETURN_TRUE;
13417 }
13418 }
13419 /* nothing matched */
13420 Py_RETURN_FALSE;
13421 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013422 if (!PyUnicode_Check(subobj)) {
13423 PyErr_Format(PyExc_TypeError,
13424 "startswith first arg must be str or "
13425 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013427 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013428 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013429 if (result == -1)
13430 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013431 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432}
13433
13434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013435PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013438Return True if S ends with the specified suffix, False otherwise.\n\
13439With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013440With optional end, stop comparing S at that position.\n\
13441suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442
13443static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013444unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013447 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013448 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013449 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013450 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013451 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013452
Jesus Ceaac451502011-04-20 17:09:23 +020013453 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013455 if (PyTuple_Check(subobj)) {
13456 Py_ssize_t i;
13457 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013458 substring = PyTuple_GET_ITEM(subobj, i);
13459 if (!PyUnicode_Check(substring)) {
13460 PyErr_Format(PyExc_TypeError,
13461 "tuple for endswith must only contain str, "
13462 "not %.100s",
13463 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013465 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013466 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013467 if (result == -1)
13468 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013469 if (result) {
13470 Py_RETURN_TRUE;
13471 }
13472 }
13473 Py_RETURN_FALSE;
13474 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013475 if (!PyUnicode_Check(subobj)) {
13476 PyErr_Format(PyExc_TypeError,
13477 "endswith first arg must be str or "
13478 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013479 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013480 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013481 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013482 if (result == -1)
13483 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013484 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013485}
13486
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013487static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013488_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013489{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013490 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13491 writer->data = PyUnicode_DATA(writer->buffer);
13492
13493 if (!writer->readonly) {
13494 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013495 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013496 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013497 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013498 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13499 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13500 writer->kind = PyUnicode_WCHAR_KIND;
13501 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13502
Victor Stinner8f674cc2013-04-17 23:02:17 +020013503 /* Copy-on-write mode: set buffer size to 0 so
13504 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13505 * next write. */
13506 writer->size = 0;
13507 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013508}
13509
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013511_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013512{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013513 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013514
13515 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013516 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013517
13518 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13519 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13520 writer->kind = PyUnicode_WCHAR_KIND;
13521 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013522}
13523
Victor Stinnerd3f08822012-05-29 12:57:52 +020013524int
13525_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13526 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013527{
13528 Py_ssize_t newlen;
13529 PyObject *newbuffer;
13530
Victor Stinner2740e462016-09-06 16:58:36 -070013531 assert(maxchar <= MAX_UNICODE);
13532
Victor Stinnerca9381e2015-09-22 00:58:32 +020013533 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013534 assert((maxchar > writer->maxchar && length >= 0)
13535 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536
Victor Stinner202fdca2012-05-07 12:47:02 +020013537 if (length > PY_SSIZE_T_MAX - writer->pos) {
13538 PyErr_NoMemory();
13539 return -1;
13540 }
13541 newlen = writer->pos + length;
13542
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013543 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013544
Victor Stinnerd3f08822012-05-29 12:57:52 +020013545 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013546 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013547 if (writer->overallocate
13548 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13549 /* overallocate to limit the number of realloc() */
13550 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013551 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013552 if (newlen < writer->min_length)
13553 newlen = writer->min_length;
13554
Victor Stinnerd3f08822012-05-29 12:57:52 +020013555 writer->buffer = PyUnicode_New(newlen, maxchar);
13556 if (writer->buffer == NULL)
13557 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013558 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013559 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013560 if (writer->overallocate
13561 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13562 /* overallocate to limit the number of realloc() */
13563 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013564 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013565 if (newlen < writer->min_length)
13566 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013567
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013568 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013569 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013570 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013571 newbuffer = PyUnicode_New(newlen, maxchar);
13572 if (newbuffer == NULL)
13573 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013574 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13575 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013576 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013577 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013578 }
13579 else {
13580 newbuffer = resize_compact(writer->buffer, newlen);
13581 if (newbuffer == NULL)
13582 return -1;
13583 }
13584 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013585 }
13586 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013587 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013588 newbuffer = PyUnicode_New(writer->size, maxchar);
13589 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013590 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013591 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13592 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013593 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013594 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013595 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013596 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013597
13598#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013599}
13600
Victor Stinnerca9381e2015-09-22 00:58:32 +020013601int
13602_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13603 enum PyUnicode_Kind kind)
13604{
13605 Py_UCS4 maxchar;
13606
13607 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13608 assert(writer->kind < kind);
13609
13610 switch (kind)
13611 {
13612 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13613 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13614 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13615 default:
13616 assert(0 && "invalid kind");
13617 return -1;
13618 }
13619
13620 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13621}
13622
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013623static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013624_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013625{
Victor Stinner2740e462016-09-06 16:58:36 -070013626 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013627 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13628 return -1;
13629 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13630 writer->pos++;
13631 return 0;
13632}
13633
13634int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013635_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13636{
13637 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13638}
13639
13640int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013641_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13642{
13643 Py_UCS4 maxchar;
13644 Py_ssize_t len;
13645
13646 if (PyUnicode_READY(str) == -1)
13647 return -1;
13648 len = PyUnicode_GET_LENGTH(str);
13649 if (len == 0)
13650 return 0;
13651 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13652 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013653 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013654 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013655 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013656 Py_INCREF(str);
13657 writer->buffer = str;
13658 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013659 writer->pos += len;
13660 return 0;
13661 }
13662 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13663 return -1;
13664 }
13665 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13666 str, 0, len);
13667 writer->pos += len;
13668 return 0;
13669}
13670
Victor Stinnere215d962012-10-06 23:03:36 +020013671int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013672_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13673 Py_ssize_t start, Py_ssize_t end)
13674{
13675 Py_UCS4 maxchar;
13676 Py_ssize_t len;
13677
13678 if (PyUnicode_READY(str) == -1)
13679 return -1;
13680
13681 assert(0 <= start);
13682 assert(end <= PyUnicode_GET_LENGTH(str));
13683 assert(start <= end);
13684
13685 if (end == 0)
13686 return 0;
13687
13688 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13689 return _PyUnicodeWriter_WriteStr(writer, str);
13690
13691 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13692 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13693 else
13694 maxchar = writer->maxchar;
13695 len = end - start;
13696
13697 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13698 return -1;
13699
13700 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13701 str, start, len);
13702 writer->pos += len;
13703 return 0;
13704}
13705
13706int
Victor Stinner4a587072013-11-19 12:54:53 +010013707_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13708 const char *ascii, Py_ssize_t len)
13709{
13710 if (len == -1)
13711 len = strlen(ascii);
13712
13713 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13714
13715 if (writer->buffer == NULL && !writer->overallocate) {
13716 PyObject *str;
13717
13718 str = _PyUnicode_FromASCII(ascii, len);
13719 if (str == NULL)
13720 return -1;
13721
13722 writer->readonly = 1;
13723 writer->buffer = str;
13724 _PyUnicodeWriter_Update(writer);
13725 writer->pos += len;
13726 return 0;
13727 }
13728
13729 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13730 return -1;
13731
13732 switch (writer->kind)
13733 {
13734 case PyUnicode_1BYTE_KIND:
13735 {
13736 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13737 Py_UCS1 *data = writer->data;
13738
Christian Heimesf051e432016-09-13 20:22:02 +020013739 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013740 break;
13741 }
13742 case PyUnicode_2BYTE_KIND:
13743 {
13744 _PyUnicode_CONVERT_BYTES(
13745 Py_UCS1, Py_UCS2,
13746 ascii, ascii + len,
13747 (Py_UCS2 *)writer->data + writer->pos);
13748 break;
13749 }
13750 case PyUnicode_4BYTE_KIND:
13751 {
13752 _PyUnicode_CONVERT_BYTES(
13753 Py_UCS1, Py_UCS4,
13754 ascii, ascii + len,
13755 (Py_UCS4 *)writer->data + writer->pos);
13756 break;
13757 }
13758 default:
13759 assert(0);
13760 }
13761
13762 writer->pos += len;
13763 return 0;
13764}
13765
13766int
13767_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13768 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013769{
13770 Py_UCS4 maxchar;
13771
13772 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13773 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13774 return -1;
13775 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13776 writer->pos += len;
13777 return 0;
13778}
13779
Victor Stinnerd3f08822012-05-29 12:57:52 +020013780PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013781_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013782{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013783 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013784
Victor Stinnerd3f08822012-05-29 12:57:52 +020013785 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013786 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013787 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013788 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013789
13790 str = writer->buffer;
13791 writer->buffer = NULL;
13792
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013793 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013794 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13795 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013796 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013797
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013798 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13799 PyObject *str2;
13800 str2 = resize_compact(str, writer->pos);
13801 if (str2 == NULL) {
13802 Py_DECREF(str);
13803 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013804 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013805 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013806 }
13807
Victor Stinner15a0bd32013-07-08 22:29:55 +020013808 assert(_PyUnicode_CheckConsistency(str, 1));
13809 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013810}
13811
Victor Stinnerd3f08822012-05-29 12:57:52 +020013812void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013813_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013814{
13815 Py_CLEAR(writer->buffer);
13816}
13817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013818#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013819
13820PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013822\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013823Return a formatted version of S, using substitutions from args and kwargs.\n\
13824The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013825
Eric Smith27bbca62010-11-04 17:06:58 +000013826PyDoc_STRVAR(format_map__doc__,
13827 "S.format_map(mapping) -> str\n\
13828\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013829Return a formatted version of S, using substitutions from mapping.\n\
13830The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013831
INADA Naoki3ae20562017-01-16 20:41:20 +090013832/*[clinic input]
13833str.__format__ as unicode___format__
13834
13835 format_spec: unicode
13836 /
13837
13838Return a formatted version of the string as described by format_spec.
13839[clinic start generated code]*/
13840
Eric Smith4a7d76d2008-05-30 18:10:19 +000013841static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013842unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013843/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013844{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013845 _PyUnicodeWriter writer;
13846 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013847
Victor Stinnerd3f08822012-05-29 12:57:52 +020013848 if (PyUnicode_READY(self) == -1)
13849 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013850 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013851 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13852 self, format_spec, 0,
13853 PyUnicode_GET_LENGTH(format_spec));
13854 if (ret == -1) {
13855 _PyUnicodeWriter_Dealloc(&writer);
13856 return NULL;
13857 }
13858 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013859}
13860
INADA Naoki3ae20562017-01-16 20:41:20 +090013861/*[clinic input]
13862str.__sizeof__ as unicode_sizeof
13863
13864Return the size of the string in memory, in bytes.
13865[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013866
13867static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013868unicode_sizeof_impl(PyObject *self)
13869/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013871 Py_ssize_t size;
13872
13873 /* If it's a compact object, account for base structure +
13874 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013875 if (PyUnicode_IS_COMPACT_ASCII(self))
13876 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13877 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013878 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013879 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013880 else {
13881 /* If it is a two-block object, account for base object, and
13882 for character block if present. */
13883 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013884 if (_PyUnicode_DATA_ANY(self))
13885 size += (PyUnicode_GET_LENGTH(self) + 1) *
13886 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013887 }
13888 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013889 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013890 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13891 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13892 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13893 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013894
13895 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013896}
13897
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013898static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013899unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013900{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013901 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013902 if (!copy)
13903 return NULL;
13904 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013905}
13906
Guido van Rossumd57fd912000-03-10 22:53:23 +000013907static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013908 UNICODE_ENCODE_METHODDEF
13909 UNICODE_REPLACE_METHODDEF
13910 UNICODE_SPLIT_METHODDEF
13911 UNICODE_RSPLIT_METHODDEF
13912 UNICODE_JOIN_METHODDEF
13913 UNICODE_CAPITALIZE_METHODDEF
13914 UNICODE_CASEFOLD_METHODDEF
13915 UNICODE_TITLE_METHODDEF
13916 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013917 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013918 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013919 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013920 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013921 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013922 UNICODE_LJUST_METHODDEF
13923 UNICODE_LOWER_METHODDEF
13924 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013925 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13926 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013927 UNICODE_RJUST_METHODDEF
13928 UNICODE_RSTRIP_METHODDEF
13929 UNICODE_RPARTITION_METHODDEF
13930 UNICODE_SPLITLINES_METHODDEF
13931 UNICODE_STRIP_METHODDEF
13932 UNICODE_SWAPCASE_METHODDEF
13933 UNICODE_TRANSLATE_METHODDEF
13934 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013935 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13936 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013937 UNICODE_ISLOWER_METHODDEF
13938 UNICODE_ISUPPER_METHODDEF
13939 UNICODE_ISTITLE_METHODDEF
13940 UNICODE_ISSPACE_METHODDEF
13941 UNICODE_ISDECIMAL_METHODDEF
13942 UNICODE_ISDIGIT_METHODDEF
13943 UNICODE_ISNUMERIC_METHODDEF
13944 UNICODE_ISALPHA_METHODDEF
13945 UNICODE_ISALNUM_METHODDEF
13946 UNICODE_ISIDENTIFIER_METHODDEF
13947 UNICODE_ISPRINTABLE_METHODDEF
13948 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013949 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013950 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013951 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013952 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013953 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013954#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013955 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013956 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013957#endif
13958
Benjamin Peterson14339b62009-01-31 16:36:08 +000013959 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013960 {NULL, NULL}
13961};
13962
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013963static PyObject *
13964unicode_mod(PyObject *v, PyObject *w)
13965{
Brian Curtindfc80e32011-08-10 20:28:54 -050013966 if (!PyUnicode_Check(v))
13967 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013968 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013969}
13970
13971static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 0, /*nb_add*/
13973 0, /*nb_subtract*/
13974 0, /*nb_multiply*/
13975 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013976};
13977
Guido van Rossumd57fd912000-03-10 22:53:23 +000013978static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 (lenfunc) unicode_length, /* sq_length */
13980 PyUnicode_Concat, /* sq_concat */
13981 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13982 (ssizeargfunc) unicode_getitem, /* sq_item */
13983 0, /* sq_slice */
13984 0, /* sq_ass_item */
13985 0, /* sq_ass_slice */
13986 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013987};
13988
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013989static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013990unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013992 if (PyUnicode_READY(self) == -1)
13993 return NULL;
13994
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013995 if (PyIndex_Check(item)) {
13996 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013997 if (i == -1 && PyErr_Occurred())
13998 return NULL;
13999 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014000 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014001 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014002 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000014003 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014004 PyObject *result;
14005 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014006 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014007 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014008
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014009 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014010 return NULL;
14011 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014012 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14013 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014014
14015 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014016 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014017 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014018 slicelength == PyUnicode_GET_LENGTH(self)) {
14019 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014020 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014021 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014022 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014023 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014024 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014025 src_kind = PyUnicode_KIND(self);
14026 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014027 if (!PyUnicode_IS_ASCII(self)) {
14028 kind_limit = kind_maxchar_limit(src_kind);
14029 max_char = 0;
14030 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14031 ch = PyUnicode_READ(src_kind, src_data, cur);
14032 if (ch > max_char) {
14033 max_char = ch;
14034 if (max_char >= kind_limit)
14035 break;
14036 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014037 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014038 }
Victor Stinner55c99112011-10-13 01:17:06 +020014039 else
14040 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014041 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014042 if (result == NULL)
14043 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014044 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014045 dest_data = PyUnicode_DATA(result);
14046
14047 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014048 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14049 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014050 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014051 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014052 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014053 } else {
14054 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14055 return NULL;
14056 }
14057}
14058
14059static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014060 (lenfunc)unicode_length, /* mp_length */
14061 (binaryfunc)unicode_subscript, /* mp_subscript */
14062 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014063};
14064
Guido van Rossumd57fd912000-03-10 22:53:23 +000014065
Guido van Rossumd57fd912000-03-10 22:53:23 +000014066/* Helpers for PyUnicode_Format() */
14067
Victor Stinnera47082312012-10-04 02:19:54 +020014068struct unicode_formatter_t {
14069 PyObject *args;
14070 int args_owned;
14071 Py_ssize_t arglen, argidx;
14072 PyObject *dict;
14073
14074 enum PyUnicode_Kind fmtkind;
14075 Py_ssize_t fmtcnt, fmtpos;
14076 void *fmtdata;
14077 PyObject *fmtstr;
14078
14079 _PyUnicodeWriter writer;
14080};
14081
14082struct unicode_format_arg_t {
14083 Py_UCS4 ch;
14084 int flags;
14085 Py_ssize_t width;
14086 int prec;
14087 int sign;
14088};
14089
Guido van Rossumd57fd912000-03-10 22:53:23 +000014090static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014091unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014092{
Victor Stinnera47082312012-10-04 02:19:54 +020014093 Py_ssize_t argidx = ctx->argidx;
14094
14095 if (argidx < ctx->arglen) {
14096 ctx->argidx++;
14097 if (ctx->arglen < 0)
14098 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014099 else
Victor Stinnera47082312012-10-04 02:19:54 +020014100 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014101 }
14102 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014103 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014104 return NULL;
14105}
14106
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014107/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014108
Victor Stinnera47082312012-10-04 02:19:54 +020014109/* Format a float into the writer if the writer is not NULL, or into *p_output
14110 otherwise.
14111
14112 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014113static int
Victor Stinnera47082312012-10-04 02:19:54 +020014114formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14115 PyObject **p_output,
14116 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014117{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014118 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014119 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014120 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014121 int prec;
14122 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014123
Guido van Rossumd57fd912000-03-10 22:53:23 +000014124 x = PyFloat_AsDouble(v);
14125 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014126 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014127
Victor Stinnera47082312012-10-04 02:19:54 +020014128 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014129 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014130 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014131
Victor Stinnera47082312012-10-04 02:19:54 +020014132 if (arg->flags & F_ALT)
14133 dtoa_flags = Py_DTSF_ALT;
14134 else
14135 dtoa_flags = 0;
14136 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014137 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014138 return -1;
14139 len = strlen(p);
14140 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014141 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014142 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014143 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014144 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014145 }
14146 else
14147 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014148 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014149 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014150}
14151
Victor Stinnerd0880d52012-04-27 23:40:13 +020014152/* formatlong() emulates the format codes d, u, o, x and X, and
14153 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14154 * Python's regular ints.
14155 * Return value: a new PyUnicodeObject*, or NULL if error.
14156 * The output string is of the form
14157 * "-"? ("0x" | "0X")? digit+
14158 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14159 * set in flags. The case of hex digits will be correct,
14160 * There will be at least prec digits, zero-filled on the left if
14161 * necessary to get that many.
14162 * val object to be converted
14163 * flags bitmask of format flags; only F_ALT is looked at
14164 * prec minimum number of digits; 0-fill on left if needed
14165 * type a character in [duoxX]; u acts the same as d
14166 *
14167 * CAUTION: o, x and X conversions on regular ints can never
14168 * produce a '-' sign, but can for Python's unbounded ints.
14169 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014170PyObject *
14171_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014172{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014173 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014174 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014175 Py_ssize_t i;
14176 int sign; /* 1 if '-', else 0 */
14177 int len; /* number of characters */
14178 Py_ssize_t llen;
14179 int numdigits; /* len == numnondigits + numdigits */
14180 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014181
Victor Stinnerd0880d52012-04-27 23:40:13 +020014182 /* Avoid exceeding SSIZE_T_MAX */
14183 if (prec > INT_MAX-3) {
14184 PyErr_SetString(PyExc_OverflowError,
14185 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014186 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014187 }
14188
14189 assert(PyLong_Check(val));
14190
14191 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014192 default:
14193 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014194 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014195 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014196 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014197 /* int and int subclasses should print numerically when a numeric */
14198 /* format code is used (see issue18780) */
14199 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014200 break;
14201 case 'o':
14202 numnondigits = 2;
14203 result = PyNumber_ToBase(val, 8);
14204 break;
14205 case 'x':
14206 case 'X':
14207 numnondigits = 2;
14208 result = PyNumber_ToBase(val, 16);
14209 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014210 }
14211 if (!result)
14212 return NULL;
14213
14214 assert(unicode_modifiable(result));
14215 assert(PyUnicode_IS_READY(result));
14216 assert(PyUnicode_IS_ASCII(result));
14217
14218 /* To modify the string in-place, there can only be one reference. */
14219 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014220 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014221 PyErr_BadInternalCall();
14222 return NULL;
14223 }
14224 buf = PyUnicode_DATA(result);
14225 llen = PyUnicode_GET_LENGTH(result);
14226 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014227 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014228 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014229 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014230 return NULL;
14231 }
14232 len = (int)llen;
14233 sign = buf[0] == '-';
14234 numnondigits += sign;
14235 numdigits = len - numnondigits;
14236 assert(numdigits > 0);
14237
14238 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014239 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014240 (type == 'o' || type == 'x' || type == 'X'))) {
14241 assert(buf[sign] == '0');
14242 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14243 buf[sign+1] == 'o');
14244 numnondigits -= 2;
14245 buf += 2;
14246 len -= 2;
14247 if (sign)
14248 buf[0] = '-';
14249 assert(len == numnondigits + numdigits);
14250 assert(numdigits > 0);
14251 }
14252
14253 /* Fill with leading zeroes to meet minimum width. */
14254 if (prec > numdigits) {
14255 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14256 numnondigits + prec);
14257 char *b1;
14258 if (!r1) {
14259 Py_DECREF(result);
14260 return NULL;
14261 }
14262 b1 = PyBytes_AS_STRING(r1);
14263 for (i = 0; i < numnondigits; ++i)
14264 *b1++ = *buf++;
14265 for (i = 0; i < prec - numdigits; i++)
14266 *b1++ = '0';
14267 for (i = 0; i < numdigits; i++)
14268 *b1++ = *buf++;
14269 *b1 = '\0';
14270 Py_DECREF(result);
14271 result = r1;
14272 buf = PyBytes_AS_STRING(result);
14273 len = numnondigits + prec;
14274 }
14275
14276 /* Fix up case for hex conversions. */
14277 if (type == 'X') {
14278 /* Need to convert all lower case letters to upper case.
14279 and need to convert 0x to 0X (and -0x to -0X). */
14280 for (i = 0; i < len; i++)
14281 if (buf[i] >= 'a' && buf[i] <= 'x')
14282 buf[i] -= 'a'-'A';
14283 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014284 if (!PyUnicode_Check(result)
14285 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014286 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014287 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014288 Py_DECREF(result);
14289 result = unicode;
14290 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014291 else if (len != PyUnicode_GET_LENGTH(result)) {
14292 if (PyUnicode_Resize(&result, len) < 0)
14293 Py_CLEAR(result);
14294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014295 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014296}
14297
Ethan Furmandf3ed242014-01-05 06:50:30 -080014298/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014299 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014300 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014301 * -1 and raise an exception on error */
14302static int
Victor Stinnera47082312012-10-04 02:19:54 +020014303mainformatlong(PyObject *v,
14304 struct unicode_format_arg_t *arg,
14305 PyObject **p_output,
14306 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014307{
14308 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014309 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014310
14311 if (!PyNumber_Check(v))
14312 goto wrongtype;
14313
Ethan Furman9ab74802014-03-21 06:38:46 -070014314 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014315 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014316 if (type == 'o' || type == 'x' || type == 'X') {
14317 iobj = PyNumber_Index(v);
14318 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014319 if (PyErr_ExceptionMatches(PyExc_TypeError))
14320 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014321 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014322 }
14323 }
14324 else {
14325 iobj = PyNumber_Long(v);
14326 if (iobj == NULL ) {
14327 if (PyErr_ExceptionMatches(PyExc_TypeError))
14328 goto wrongtype;
14329 return -1;
14330 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014331 }
14332 assert(PyLong_Check(iobj));
14333 }
14334 else {
14335 iobj = v;
14336 Py_INCREF(iobj);
14337 }
14338
14339 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014340 && arg->width == -1 && arg->prec == -1
14341 && !(arg->flags & (F_SIGN | F_BLANK))
14342 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014343 {
14344 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014345 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014346 int base;
14347
Victor Stinnera47082312012-10-04 02:19:54 +020014348 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014349 {
14350 default:
14351 assert(0 && "'type' not in [diuoxX]");
14352 case 'd':
14353 case 'i':
14354 case 'u':
14355 base = 10;
14356 break;
14357 case 'o':
14358 base = 8;
14359 break;
14360 case 'x':
14361 case 'X':
14362 base = 16;
14363 break;
14364 }
14365
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014366 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14367 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014368 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014369 }
14370 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014371 return 1;
14372 }
14373
Ethan Furmanb95b5612015-01-23 20:05:18 -080014374 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014375 Py_DECREF(iobj);
14376 if (res == NULL)
14377 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014378 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014379 return 0;
14380
14381wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014382 switch(type)
14383 {
14384 case 'o':
14385 case 'x':
14386 case 'X':
14387 PyErr_Format(PyExc_TypeError,
14388 "%%%c format: an integer is required, "
14389 "not %.200s",
14390 type, Py_TYPE(v)->tp_name);
14391 break;
14392 default:
14393 PyErr_Format(PyExc_TypeError,
14394 "%%%c format: a number is required, "
14395 "not %.200s",
14396 type, Py_TYPE(v)->tp_name);
14397 break;
14398 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014399 return -1;
14400}
14401
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014402static Py_UCS4
14403formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014404{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014405 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014406 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014407 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014408 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014409 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014410 goto onError;
14411 }
14412 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014413 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014414 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014415 /* make sure number is a type of integer */
14416 if (!PyLong_Check(v)) {
14417 iobj = PyNumber_Index(v);
14418 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014419 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014420 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014421 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014422 Py_DECREF(iobj);
14423 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014424 else {
14425 x = PyLong_AsLong(v);
14426 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014427 if (x == -1 && PyErr_Occurred())
14428 goto onError;
14429
Victor Stinner8faf8212011-12-08 22:14:11 +010014430 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014431 PyErr_SetString(PyExc_OverflowError,
14432 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014433 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014434 }
14435
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014436 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014437 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014438
Benjamin Peterson29060642009-01-31 22:14:21 +000014439 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014440 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014441 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014442 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014443}
14444
Victor Stinnera47082312012-10-04 02:19:54 +020014445/* Parse options of an argument: flags, width, precision.
14446 Handle also "%(name)" syntax.
14447
14448 Return 0 if the argument has been formatted into arg->str.
14449 Return 1 if the argument has been written into ctx->writer,
14450 Raise an exception and return -1 on error. */
14451static int
14452unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14453 struct unicode_format_arg_t *arg)
14454{
14455#define FORMAT_READ(ctx) \
14456 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14457
14458 PyObject *v;
14459
Victor Stinnera47082312012-10-04 02:19:54 +020014460 if (arg->ch == '(') {
14461 /* Get argument value from a dictionary. Example: "%(name)s". */
14462 Py_ssize_t keystart;
14463 Py_ssize_t keylen;
14464 PyObject *key;
14465 int pcount = 1;
14466
14467 if (ctx->dict == NULL) {
14468 PyErr_SetString(PyExc_TypeError,
14469 "format requires a mapping");
14470 return -1;
14471 }
14472 ++ctx->fmtpos;
14473 --ctx->fmtcnt;
14474 keystart = ctx->fmtpos;
14475 /* Skip over balanced parentheses */
14476 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14477 arg->ch = FORMAT_READ(ctx);
14478 if (arg->ch == ')')
14479 --pcount;
14480 else if (arg->ch == '(')
14481 ++pcount;
14482 ctx->fmtpos++;
14483 }
14484 keylen = ctx->fmtpos - keystart - 1;
14485 if (ctx->fmtcnt < 0 || pcount > 0) {
14486 PyErr_SetString(PyExc_ValueError,
14487 "incomplete format key");
14488 return -1;
14489 }
14490 key = PyUnicode_Substring(ctx->fmtstr,
14491 keystart, keystart + keylen);
14492 if (key == NULL)
14493 return -1;
14494 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014495 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014496 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014497 }
14498 ctx->args = PyObject_GetItem(ctx->dict, key);
14499 Py_DECREF(key);
14500 if (ctx->args == NULL)
14501 return -1;
14502 ctx->args_owned = 1;
14503 ctx->arglen = -1;
14504 ctx->argidx = -2;
14505 }
14506
14507 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014508 while (--ctx->fmtcnt >= 0) {
14509 arg->ch = FORMAT_READ(ctx);
14510 ctx->fmtpos++;
14511 switch (arg->ch) {
14512 case '-': arg->flags |= F_LJUST; continue;
14513 case '+': arg->flags |= F_SIGN; continue;
14514 case ' ': arg->flags |= F_BLANK; continue;
14515 case '#': arg->flags |= F_ALT; continue;
14516 case '0': arg->flags |= F_ZERO; continue;
14517 }
14518 break;
14519 }
14520
14521 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014522 if (arg->ch == '*') {
14523 v = unicode_format_getnextarg(ctx);
14524 if (v == NULL)
14525 return -1;
14526 if (!PyLong_Check(v)) {
14527 PyErr_SetString(PyExc_TypeError,
14528 "* wants int");
14529 return -1;
14530 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014531 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014532 if (arg->width == -1 && PyErr_Occurred())
14533 return -1;
14534 if (arg->width < 0) {
14535 arg->flags |= F_LJUST;
14536 arg->width = -arg->width;
14537 }
14538 if (--ctx->fmtcnt >= 0) {
14539 arg->ch = FORMAT_READ(ctx);
14540 ctx->fmtpos++;
14541 }
14542 }
14543 else if (arg->ch >= '0' && arg->ch <= '9') {
14544 arg->width = arg->ch - '0';
14545 while (--ctx->fmtcnt >= 0) {
14546 arg->ch = FORMAT_READ(ctx);
14547 ctx->fmtpos++;
14548 if (arg->ch < '0' || arg->ch > '9')
14549 break;
14550 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14551 mixing signed and unsigned comparison. Since arg->ch is between
14552 '0' and '9', casting to int is safe. */
14553 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14554 PyErr_SetString(PyExc_ValueError,
14555 "width too big");
14556 return -1;
14557 }
14558 arg->width = arg->width*10 + (arg->ch - '0');
14559 }
14560 }
14561
14562 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014563 if (arg->ch == '.') {
14564 arg->prec = 0;
14565 if (--ctx->fmtcnt >= 0) {
14566 arg->ch = FORMAT_READ(ctx);
14567 ctx->fmtpos++;
14568 }
14569 if (arg->ch == '*') {
14570 v = unicode_format_getnextarg(ctx);
14571 if (v == NULL)
14572 return -1;
14573 if (!PyLong_Check(v)) {
14574 PyErr_SetString(PyExc_TypeError,
14575 "* wants int");
14576 return -1;
14577 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014578 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014579 if (arg->prec == -1 && PyErr_Occurred())
14580 return -1;
14581 if (arg->prec < 0)
14582 arg->prec = 0;
14583 if (--ctx->fmtcnt >= 0) {
14584 arg->ch = FORMAT_READ(ctx);
14585 ctx->fmtpos++;
14586 }
14587 }
14588 else if (arg->ch >= '0' && arg->ch <= '9') {
14589 arg->prec = arg->ch - '0';
14590 while (--ctx->fmtcnt >= 0) {
14591 arg->ch = FORMAT_READ(ctx);
14592 ctx->fmtpos++;
14593 if (arg->ch < '0' || arg->ch > '9')
14594 break;
14595 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14596 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014597 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014598 return -1;
14599 }
14600 arg->prec = arg->prec*10 + (arg->ch - '0');
14601 }
14602 }
14603 }
14604
14605 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14606 if (ctx->fmtcnt >= 0) {
14607 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14608 if (--ctx->fmtcnt >= 0) {
14609 arg->ch = FORMAT_READ(ctx);
14610 ctx->fmtpos++;
14611 }
14612 }
14613 }
14614 if (ctx->fmtcnt < 0) {
14615 PyErr_SetString(PyExc_ValueError,
14616 "incomplete format");
14617 return -1;
14618 }
14619 return 0;
14620
14621#undef FORMAT_READ
14622}
14623
14624/* Format one argument. Supported conversion specifiers:
14625
14626 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014627 - "i", "d", "u": int or float
14628 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014629 - "e", "E", "f", "F", "g", "G": float
14630 - "c": int or str (1 character)
14631
Victor Stinner8dbd4212012-12-04 09:30:24 +010014632 When possible, the output is written directly into the Unicode writer
14633 (ctx->writer). A string is created when padding is required.
14634
Victor Stinnera47082312012-10-04 02:19:54 +020014635 Return 0 if the argument has been formatted into *p_str,
14636 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014637 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014638static int
14639unicode_format_arg_format(struct unicode_formatter_t *ctx,
14640 struct unicode_format_arg_t *arg,
14641 PyObject **p_str)
14642{
14643 PyObject *v;
14644 _PyUnicodeWriter *writer = &ctx->writer;
14645
14646 if (ctx->fmtcnt == 0)
14647 ctx->writer.overallocate = 0;
14648
Victor Stinnera47082312012-10-04 02:19:54 +020014649 v = unicode_format_getnextarg(ctx);
14650 if (v == NULL)
14651 return -1;
14652
Victor Stinnera47082312012-10-04 02:19:54 +020014653
14654 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014655 case 's':
14656 case 'r':
14657 case 'a':
14658 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14659 /* Fast path */
14660 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14661 return -1;
14662 return 1;
14663 }
14664
14665 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14666 *p_str = v;
14667 Py_INCREF(*p_str);
14668 }
14669 else {
14670 if (arg->ch == 's')
14671 *p_str = PyObject_Str(v);
14672 else if (arg->ch == 'r')
14673 *p_str = PyObject_Repr(v);
14674 else
14675 *p_str = PyObject_ASCII(v);
14676 }
14677 break;
14678
14679 case 'i':
14680 case 'd':
14681 case 'u':
14682 case 'o':
14683 case 'x':
14684 case 'X':
14685 {
14686 int ret = mainformatlong(v, arg, p_str, writer);
14687 if (ret != 0)
14688 return ret;
14689 arg->sign = 1;
14690 break;
14691 }
14692
14693 case 'e':
14694 case 'E':
14695 case 'f':
14696 case 'F':
14697 case 'g':
14698 case 'G':
14699 if (arg->width == -1 && arg->prec == -1
14700 && !(arg->flags & (F_SIGN | F_BLANK)))
14701 {
14702 /* Fast path */
14703 if (formatfloat(v, arg, NULL, writer) == -1)
14704 return -1;
14705 return 1;
14706 }
14707
14708 arg->sign = 1;
14709 if (formatfloat(v, arg, p_str, NULL) == -1)
14710 return -1;
14711 break;
14712
14713 case 'c':
14714 {
14715 Py_UCS4 ch = formatchar(v);
14716 if (ch == (Py_UCS4) -1)
14717 return -1;
14718 if (arg->width == -1 && arg->prec == -1) {
14719 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014720 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014721 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014722 return 1;
14723 }
14724 *p_str = PyUnicode_FromOrdinal(ch);
14725 break;
14726 }
14727
14728 default:
14729 PyErr_Format(PyExc_ValueError,
14730 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014731 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014732 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14733 (int)arg->ch,
14734 ctx->fmtpos - 1);
14735 return -1;
14736 }
14737 if (*p_str == NULL)
14738 return -1;
14739 assert (PyUnicode_Check(*p_str));
14740 return 0;
14741}
14742
14743static int
14744unicode_format_arg_output(struct unicode_formatter_t *ctx,
14745 struct unicode_format_arg_t *arg,
14746 PyObject *str)
14747{
14748 Py_ssize_t len;
14749 enum PyUnicode_Kind kind;
14750 void *pbuf;
14751 Py_ssize_t pindex;
14752 Py_UCS4 signchar;
14753 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014754 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014755 Py_ssize_t sublen;
14756 _PyUnicodeWriter *writer = &ctx->writer;
14757 Py_UCS4 fill;
14758
14759 fill = ' ';
14760 if (arg->sign && arg->flags & F_ZERO)
14761 fill = '0';
14762
14763 if (PyUnicode_READY(str) == -1)
14764 return -1;
14765
14766 len = PyUnicode_GET_LENGTH(str);
14767 if ((arg->width == -1 || arg->width <= len)
14768 && (arg->prec == -1 || arg->prec >= len)
14769 && !(arg->flags & (F_SIGN | F_BLANK)))
14770 {
14771 /* Fast path */
14772 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14773 return -1;
14774 return 0;
14775 }
14776
14777 /* Truncate the string for "s", "r" and "a" formats
14778 if the precision is set */
14779 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14780 if (arg->prec >= 0 && len > arg->prec)
14781 len = arg->prec;
14782 }
14783
14784 /* Adjust sign and width */
14785 kind = PyUnicode_KIND(str);
14786 pbuf = PyUnicode_DATA(str);
14787 pindex = 0;
14788 signchar = '\0';
14789 if (arg->sign) {
14790 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14791 if (ch == '-' || ch == '+') {
14792 signchar = ch;
14793 len--;
14794 pindex++;
14795 }
14796 else if (arg->flags & F_SIGN)
14797 signchar = '+';
14798 else if (arg->flags & F_BLANK)
14799 signchar = ' ';
14800 else
14801 arg->sign = 0;
14802 }
14803 if (arg->width < len)
14804 arg->width = len;
14805
14806 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014807 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014808 if (!(arg->flags & F_LJUST)) {
14809 if (arg->sign) {
14810 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014811 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014812 }
14813 else {
14814 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014815 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014816 }
14817 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014818 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14819 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014820 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014821 }
14822
Victor Stinnera47082312012-10-04 02:19:54 +020014823 buflen = arg->width;
14824 if (arg->sign && len == arg->width)
14825 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014826 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014827 return -1;
14828
14829 /* Write the sign if needed */
14830 if (arg->sign) {
14831 if (fill != ' ') {
14832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14833 writer->pos += 1;
14834 }
14835 if (arg->width > len)
14836 arg->width--;
14837 }
14838
14839 /* Write the numeric prefix for "x", "X" and "o" formats
14840 if the alternate form is used.
14841 For example, write "0x" for the "%#x" format. */
14842 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14843 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14844 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14845 if (fill != ' ') {
14846 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14847 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14848 writer->pos += 2;
14849 pindex += 2;
14850 }
14851 arg->width -= 2;
14852 if (arg->width < 0)
14853 arg->width = 0;
14854 len -= 2;
14855 }
14856
14857 /* Pad left with the fill character if needed */
14858 if (arg->width > len && !(arg->flags & F_LJUST)) {
14859 sublen = arg->width - len;
14860 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14861 writer->pos += sublen;
14862 arg->width = len;
14863 }
14864
14865 /* If padding with spaces: write sign if needed and/or numeric prefix if
14866 the alternate form is used */
14867 if (fill == ' ') {
14868 if (arg->sign) {
14869 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14870 writer->pos += 1;
14871 }
14872 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14873 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14874 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14875 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14876 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14877 writer->pos += 2;
14878 pindex += 2;
14879 }
14880 }
14881
14882 /* Write characters */
14883 if (len) {
14884 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14885 str, pindex, len);
14886 writer->pos += len;
14887 }
14888
14889 /* Pad right with the fill character if needed */
14890 if (arg->width > len) {
14891 sublen = arg->width - len;
14892 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14893 writer->pos += sublen;
14894 }
14895 return 0;
14896}
14897
14898/* Helper of PyUnicode_Format(): format one arg.
14899 Return 0 on success, raise an exception and return -1 on error. */
14900static int
14901unicode_format_arg(struct unicode_formatter_t *ctx)
14902{
14903 struct unicode_format_arg_t arg;
14904 PyObject *str;
14905 int ret;
14906
Victor Stinner8dbd4212012-12-04 09:30:24 +010014907 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014908 if (arg.ch == '%') {
14909 ctx->fmtpos++;
14910 ctx->fmtcnt--;
14911 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14912 return -1;
14913 return 0;
14914 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014915 arg.flags = 0;
14916 arg.width = -1;
14917 arg.prec = -1;
14918 arg.sign = 0;
14919 str = NULL;
14920
Victor Stinnera47082312012-10-04 02:19:54 +020014921 ret = unicode_format_arg_parse(ctx, &arg);
14922 if (ret == -1)
14923 return -1;
14924
14925 ret = unicode_format_arg_format(ctx, &arg, &str);
14926 if (ret == -1)
14927 return -1;
14928
14929 if (ret != 1) {
14930 ret = unicode_format_arg_output(ctx, &arg, str);
14931 Py_DECREF(str);
14932 if (ret == -1)
14933 return -1;
14934 }
14935
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014936 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014937 PyErr_SetString(PyExc_TypeError,
14938 "not all arguments converted during string formatting");
14939 return -1;
14940 }
14941 return 0;
14942}
14943
Alexander Belopolsky40018472011-02-26 01:02:56 +000014944PyObject *
14945PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946{
Victor Stinnera47082312012-10-04 02:19:54 +020014947 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014948
Guido van Rossumd57fd912000-03-10 22:53:23 +000014949 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014950 PyErr_BadInternalCall();
14951 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014952 }
Victor Stinnera47082312012-10-04 02:19:54 +020014953
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014954 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014955 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014956
14957 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014958 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14959 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14960 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14961 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014962
Victor Stinner8f674cc2013-04-17 23:02:17 +020014963 _PyUnicodeWriter_Init(&ctx.writer);
14964 ctx.writer.min_length = ctx.fmtcnt + 100;
14965 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014966
Guido van Rossumd57fd912000-03-10 22:53:23 +000014967 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014968 ctx.arglen = PyTuple_Size(args);
14969 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014970 }
14971 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014972 ctx.arglen = -1;
14973 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014974 }
Victor Stinnera47082312012-10-04 02:19:54 +020014975 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014976 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014977 ctx.dict = args;
14978 else
14979 ctx.dict = NULL;
14980 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014981
Victor Stinnera47082312012-10-04 02:19:54 +020014982 while (--ctx.fmtcnt >= 0) {
14983 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014984 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014985
14986 nonfmtpos = ctx.fmtpos++;
14987 while (ctx.fmtcnt >= 0 &&
14988 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14989 ctx.fmtpos++;
14990 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014991 }
Victor Stinnera47082312012-10-04 02:19:54 +020014992 if (ctx.fmtcnt < 0) {
14993 ctx.fmtpos--;
14994 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014995 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014996
Victor Stinnercfc4c132013-04-03 01:48:39 +020014997 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14998 nonfmtpos, ctx.fmtpos) < 0)
14999 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015000 }
15001 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015002 ctx.fmtpos++;
15003 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015004 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015005 }
15006 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015007
Victor Stinnera47082312012-10-04 02:19:54 +020015008 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015009 PyErr_SetString(PyExc_TypeError,
15010 "not all arguments converted during string formatting");
15011 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015012 }
15013
Victor Stinnera47082312012-10-04 02:19:54 +020015014 if (ctx.args_owned) {
15015 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015016 }
Victor Stinnera47082312012-10-04 02:19:54 +020015017 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015018
Benjamin Peterson29060642009-01-31 22:14:21 +000015019 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015020 _PyUnicodeWriter_Dealloc(&ctx.writer);
15021 if (ctx.args_owned) {
15022 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023 }
15024 return NULL;
15025}
15026
Jeremy Hylton938ace62002-07-17 16:30:39 +000015027static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015028unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15029
Tim Peters6d6c1a32001-08-02 04:15:00 +000015030static PyObject *
15031unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15032{
Benjamin Peterson29060642009-01-31 22:14:21 +000015033 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015034 static char *kwlist[] = {"object", "encoding", "errors", 0};
15035 char *encoding = NULL;
15036 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015037
Benjamin Peterson14339b62009-01-31 16:36:08 +000015038 if (type != &PyUnicode_Type)
15039 return unicode_subtype_new(type, args, kwds);
15040 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015041 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015042 return NULL;
15043 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015044 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015045 if (encoding == NULL && errors == NULL)
15046 return PyObject_Str(x);
15047 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015048 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015049}
15050
Guido van Rossume023fe02001-08-30 03:12:59 +000015051static PyObject *
15052unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15053{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015054 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055 Py_ssize_t length, char_size;
15056 int share_wstr, share_utf8;
15057 unsigned int kind;
15058 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015059
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015061
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015062 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015063 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015064 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015065 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015066 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015067 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015068 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015069 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015070
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015071 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015072 if (self == NULL) {
15073 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015074 return NULL;
15075 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015076 kind = PyUnicode_KIND(unicode);
15077 length = PyUnicode_GET_LENGTH(unicode);
15078
15079 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015080#ifdef Py_DEBUG
15081 _PyUnicode_HASH(self) = -1;
15082#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015083 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015084#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015085 _PyUnicode_STATE(self).interned = 0;
15086 _PyUnicode_STATE(self).kind = kind;
15087 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015088 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015089 _PyUnicode_STATE(self).ready = 1;
15090 _PyUnicode_WSTR(self) = NULL;
15091 _PyUnicode_UTF8_LENGTH(self) = 0;
15092 _PyUnicode_UTF8(self) = NULL;
15093 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015094 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015095
15096 share_utf8 = 0;
15097 share_wstr = 0;
15098 if (kind == PyUnicode_1BYTE_KIND) {
15099 char_size = 1;
15100 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15101 share_utf8 = 1;
15102 }
15103 else if (kind == PyUnicode_2BYTE_KIND) {
15104 char_size = 2;
15105 if (sizeof(wchar_t) == 2)
15106 share_wstr = 1;
15107 }
15108 else {
15109 assert(kind == PyUnicode_4BYTE_KIND);
15110 char_size = 4;
15111 if (sizeof(wchar_t) == 4)
15112 share_wstr = 1;
15113 }
15114
15115 /* Ensure we won't overflow the length. */
15116 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15117 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015118 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015120 data = PyObject_MALLOC((length + 1) * char_size);
15121 if (data == NULL) {
15122 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015123 goto onError;
15124 }
15125
Victor Stinnerc3c74152011-10-02 20:39:55 +020015126 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015127 if (share_utf8) {
15128 _PyUnicode_UTF8_LENGTH(self) = length;
15129 _PyUnicode_UTF8(self) = data;
15130 }
15131 if (share_wstr) {
15132 _PyUnicode_WSTR_LENGTH(self) = length;
15133 _PyUnicode_WSTR(self) = (wchar_t *)data;
15134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015135
Christian Heimesf051e432016-09-13 20:22:02 +020015136 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015137 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015138 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015139#ifdef Py_DEBUG
15140 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15141#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015142 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015143 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015144
15145onError:
15146 Py_DECREF(unicode);
15147 Py_DECREF(self);
15148 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015149}
15150
/* Docstring of the str type; stored in the tp_doc slot of PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015151PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015152"str(object='') -> str\n\
15153str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015154\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015155Create a new string object from the given object. If encoding or\n\
15156errors is specified, then the object must expose a data buffer\n\
15157that will be decoded using the given encoding and error handler.\n\
15158Otherwise, returns the result of object.__str__() (if defined)\n\
15159or repr(object).\n\
15160encoding defaults to sys.getdefaultencoding().\n\
15161errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015162
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015163static PyObject *unicode_iter(PyObject *seq);
15164
/* Type object of str.  Slots are listed in PyTypeObject declaration
   order; a 0 entry leaves the slot unset. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015165PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015166 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015167 "str", /* tp_name */
15168 sizeof(PyUnicodeObject), /* tp_basicsize */
15169 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015170 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015171 (destructor)unicode_dealloc, /* tp_dealloc */
15172 0, /* tp_print */
15173 0, /* tp_getattr */
15174 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015175 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015176 unicode_repr, /* tp_repr */
15177 &unicode_as_number, /* tp_as_number */
15178 &unicode_as_sequence, /* tp_as_sequence */
15179 &unicode_as_mapping, /* tp_as_mapping */
15180 (hashfunc) unicode_hash, /* tp_hash*/
15181 0, /* tp_call*/
15182 (reprfunc) unicode_str, /* tp_str */
15183 PyObject_GenericGetAttr, /* tp_getattro */
15184 0, /* tp_setattro */
15185 0, /* tp_as_buffer */
15186 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015187 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015188 unicode_doc, /* tp_doc */
15189 0, /* tp_traverse */
15190 0, /* tp_clear */
15191 PyUnicode_RichCompare, /* tp_richcompare */
15192 0, /* tp_weaklistoffset */
15193 unicode_iter, /* tp_iter */
15194 0, /* tp_iternext */
15195 unicode_methods, /* tp_methods */
15196 0, /* tp_members */
15197 0, /* tp_getset */
15198 &PyBaseObject_Type, /* tp_base */
15199 0, /* tp_dict */
15200 0, /* tp_descr_get */
15201 0, /* tp_descr_set */
15202 0, /* tp_dictoffset */
15203 0, /* tp_init */
15204 0, /* tp_alloc */
15205 unicode_new, /* tp_new */
15206 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015207};
15208
15209/* Initialize the Unicode implementation */
15210
Victor Stinner3a50e702011-10-18 21:21:00 +020015211int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015212{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015213 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015214 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015215 0x000A, /* LINE FEED */
15216 0x000D, /* CARRIAGE RETURN */
15217 0x001C, /* FILE SEPARATOR */
15218 0x001D, /* GROUP SEPARATOR */
15219 0x001E, /* RECORD SEPARATOR */
15220 0x0085, /* NEXT LINE */
15221 0x2028, /* LINE SEPARATOR */
15222 0x2029, /* PARAGRAPH SEPARATOR */
15223 };
15224
Fred Drakee4315f52000-05-09 19:53:39 +000015225 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015226 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015227 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015228 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015229 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015230
Guido van Rossumcacfc072002-05-24 19:01:59 +000015231 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015232 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015233
15234 /* initialize the linebreak bloom filter */
15235 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015236 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015237 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015238
Christian Heimes26532f72013-07-20 14:57:16 +020015239 if (PyType_Ready(&EncodingMapType) < 0)
15240 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015241
Benjamin Petersonc4311282012-10-30 23:21:10 -040015242 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15243 Py_FatalError("Can't initialize field name iterator type");
15244
15245 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15246 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015247
Victor Stinner3a50e702011-10-18 21:21:00 +020015248 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015249}
15250
15251/* Finalize the Unicode implementation */
15252
/* Does no work in this implementation: the function unconditionally
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
15258
Guido van Rossumd57fd912000-03-10 22:53:23 +000015259void
Thomas Wouters78890102000-07-22 19:25:51 +000015260_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015261{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015262 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015263
Serhiy Storchaka05997252013-01-26 12:14:02 +020015264 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015265
Serhiy Storchaka05997252013-01-26 12:14:02 +020015266 for (i = 0; i < 256; i++)
15267 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015268 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015269 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015270}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015271
Walter Dörwald16807132007-05-25 13:52:07 +000015272void
15273PyUnicode_InternInPlace(PyObject **p)
15274{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015275 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015277#ifdef Py_DEBUG
15278 assert(s != NULL);
15279 assert(_PyUnicode_CHECK(s));
15280#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015282 return;
15283#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 /* If it's a subclass, we don't really know what putting
15285 it in the interned dict might do. */
15286 if (!PyUnicode_CheckExact(s))
15287 return;
15288 if (PyUnicode_CHECK_INTERNED(s))
15289 return;
15290 if (interned == NULL) {
15291 interned = PyDict_New();
15292 if (interned == NULL) {
15293 PyErr_Clear(); /* Don't leave an exception */
15294 return;
15295 }
15296 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015297 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015298 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015299 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015300 if (t == NULL) {
15301 PyErr_Clear();
15302 return;
15303 }
15304 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015305 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015306 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015307 return;
15308 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015309 /* The two references in interned are not counted by refcnt.
15310 The deallocator will take care of this */
15311 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015312 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015313}
15314
15315void
15316PyUnicode_InternImmortal(PyObject **p)
15317{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015318 PyUnicode_InternInPlace(p);
15319 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015320 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015321 Py_INCREF(*p);
15322 }
Walter Dörwald16807132007-05-25 13:52:07 +000015323}
15324
15325PyObject *
15326PyUnicode_InternFromString(const char *cp)
15327{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 PyObject *s = PyUnicode_FromString(cp);
15329 if (s == NULL)
15330 return NULL;
15331 PyUnicode_InternInPlace(&s);
15332 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015333}
15334
Alexander Belopolsky40018472011-02-26 01:02:56 +000015335void
15336_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015337{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015338 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015339 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015340 Py_ssize_t i, n;
15341 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015342
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 if (interned == NULL || !PyDict_Check(interned))
15344 return;
15345 keys = PyDict_Keys(interned);
15346 if (keys == NULL || !PyList_Check(keys)) {
15347 PyErr_Clear();
15348 return;
15349 }
Walter Dörwald16807132007-05-25 13:52:07 +000015350
Benjamin Peterson14339b62009-01-31 16:36:08 +000015351 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15352 detector, interned unicode strings are not forcibly deallocated;
15353 rather, we give them their stolen references back, and then clear
15354 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015355
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 n = PyList_GET_SIZE(keys);
15357 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015358 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015360 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015361 if (PyUnicode_READY(s) == -1) {
15362 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015363 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015365 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 case SSTATE_NOT_INTERNED:
15367 /* XXX Shouldn't happen */
15368 break;
15369 case SSTATE_INTERNED_IMMORTAL:
15370 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015371 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015372 break;
15373 case SSTATE_INTERNED_MORTAL:
15374 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015375 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015376 break;
15377 default:
15378 Py_FatalError("Inconsistent interned string state.");
15379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015380 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015381 }
15382 fprintf(stderr, "total size of all interned strings: "
15383 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15384 "mortal/immortal\n", mortal_size, immortal_size);
15385 Py_DECREF(keys);
15386 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015387 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015388}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015389
15390
15391/********************* Unicode Iterator **************************/
15392
15393typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015394 PyObject_HEAD
15395 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015396 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015397} unicodeiterobject;
15398
15399static void
15400unicodeiter_dealloc(unicodeiterobject *it)
15401{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 _PyObject_GC_UNTRACK(it);
15403 Py_XDECREF(it->it_seq);
15404 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015405}
15406
15407static int
15408unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15409{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015410 Py_VISIT(it->it_seq);
15411 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015412}
15413
15414static PyObject *
15415unicodeiter_next(unicodeiterobject *it)
15416{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015417 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015418
Benjamin Peterson14339b62009-01-31 16:36:08 +000015419 assert(it != NULL);
15420 seq = it->it_seq;
15421 if (seq == NULL)
15422 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015423 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015425 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15426 int kind = PyUnicode_KIND(seq);
15427 void *data = PyUnicode_DATA(seq);
15428 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15429 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015430 if (item != NULL)
15431 ++it->it_index;
15432 return item;
15433 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015434
Benjamin Peterson14339b62009-01-31 16:36:08 +000015435 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015436 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015437 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015438}
15439
15440static PyObject *
15441unicodeiter_len(unicodeiterobject *it)
15442{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015443 Py_ssize_t len = 0;
15444 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015445 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015446 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015447}
15448
15449PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15450
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015451static PyObject *
15452unicodeiter_reduce(unicodeiterobject *it)
15453{
15454 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015455 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015456 it->it_seq, it->it_index);
15457 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015458 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015459 if (u == NULL)
15460 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015461 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015462 }
15463}
15464
15465PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15466
15467static PyObject *
15468unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15469{
15470 Py_ssize_t index = PyLong_AsSsize_t(state);
15471 if (index == -1 && PyErr_Occurred())
15472 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015473 if (it->it_seq != NULL) {
15474 if (index < 0)
15475 index = 0;
15476 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15477 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15478 it->it_index = index;
15479 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015480 Py_RETURN_NONE;
15481}
15482
15483PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15484
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015485static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015486 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015487 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015488 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15489 reduce_doc},
15490 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15491 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015492 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015493};
15494
15495PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015496 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15497 "str_iterator", /* tp_name */
15498 sizeof(unicodeiterobject), /* tp_basicsize */
15499 0, /* tp_itemsize */
15500 /* methods */
15501 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15502 0, /* tp_print */
15503 0, /* tp_getattr */
15504 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015505 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015506 0, /* tp_repr */
15507 0, /* tp_as_number */
15508 0, /* tp_as_sequence */
15509 0, /* tp_as_mapping */
15510 0, /* tp_hash */
15511 0, /* tp_call */
15512 0, /* tp_str */
15513 PyObject_GenericGetAttr, /* tp_getattro */
15514 0, /* tp_setattro */
15515 0, /* tp_as_buffer */
15516 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15517 0, /* tp_doc */
15518 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15519 0, /* tp_clear */
15520 0, /* tp_richcompare */
15521 0, /* tp_weaklistoffset */
15522 PyObject_SelfIter, /* tp_iter */
15523 (iternextfunc)unicodeiter_next, /* tp_iternext */
15524 unicodeiter_methods, /* tp_methods */
15525 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015526};
15527
15528static PyObject *
15529unicode_iter(PyObject *seq)
15530{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015531 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015532
Benjamin Peterson14339b62009-01-31 16:36:08 +000015533 if (!PyUnicode_Check(seq)) {
15534 PyErr_BadInternalCall();
15535 return NULL;
15536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015537 if (PyUnicode_READY(seq) == -1)
15538 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015539 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15540 if (it == NULL)
15541 return NULL;
15542 it->it_index = 0;
15543 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015544 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015545 _PyObject_GC_TRACK(it);
15546 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015547}
15548
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015549
/* Number of characters in the null-terminated Py_UNICODE string `u`, not
   counting the terminator.  Thin wrapper around wcslen(). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);

    return length;
}
15555
/* Copy the null-terminated string `s2`, terminator included, into `s1`.
   No bounds checking is performed.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15563
/* Copy at most `n` characters of the null-terminated string `s2` into
   `s1` and return `s1`.

   Copying stops right after the terminator has been copied, or as soon as
   `n` characters have been written, whichever comes first.  Consequently:
     - nothing is written when n == 0;
     - `s1` is NOT null-terminated when `s2` has `n` or more characters;
     - unlike the standard strncpy(), the remainder of `s1` is not padded
       with null characters.

   The previous implementation tested the counter only after each write,
   so it stored n + 1 characters for long sources (one character even for
   n == 0), writing past a destination sized for `n` elements. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Check the budget before every write so that never more than n
       characters are stored. */
    while (n-- != 0) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
15573
/* Append the null-terminated string `s2`, terminator included, to the end
   of the null-terminated string `s1`.  No bounds checking is performed.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing at the current terminator of s1. */
    Py_UNICODE *tail = s1 + wcslen(s1);

    do {
        *tail = *s2++;
    } while (*tail++ != 0);

    return s1;
}
15582
/* Compare two null-terminated Py_UNICODE strings.

   Returns 0 when they are equal, -1 when `s1` sorts before `s2` and +1
   otherwise.  A string that is a proper prefix of the other one sorts
   first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        Py_UNICODE a = *s1;
        Py_UNICODE b = *s2;

        if (a == 0) {
            /* s1 ended: equal if s2 ended too, otherwise s1 is shorter. */
            return (b == 0) ? 0 : -1;
        }
        if (b == 0) {
            /* s2 ended first. */
            return 1;
        }
        if (a != b) {
            return (a < b) ? -1 : +1;
        }
    }
}
15596
/* Compare at most `n` characters of two null-terminated Py_UNICODE
   strings.

   Returns -1 or +1 at the first differing character (according to which
   character is smaller), and 0 when the strings agree up to their common
   terminator or over the first `n` characters. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n-- != 0) {
        Py_UNICODE a = *s1++;
        Py_UNICODE b = *s2++;

        if (a != b) {
            return (a < b) ? -1 : +1;
        }
        if (a == '\0') {
            /* Both strings ended at the same position. */
            return 0;
        }
    }
    return 0;
}
15613
/* Find the first occurrence of `c` in the null-terminated string `s`.

   Returns a pointer to the matching character, or NULL when there is
   none.  The terminator itself is never examined, so searching for 0
   yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor = s;

    while (*cursor != 0) {
        if (*cursor == c) {
            return (Py_UNICODE*)cursor;
        }
        cursor++;
    }
    return NULL;
}
15623
/* Find the last occurrence of `c` in the null-terminated string `s`.

   The string is scanned backwards starting at the character just before
   the terminator.  Returns a pointer to the matching character, or NULL
   when there is none (the terminator itself is never examined). */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = wcslen(s);

    while (pos > 0) {
        pos--;
        if (s[pos] == c) {
            return (Py_UNICODE*)(s + pos);
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015636
Victor Stinner71133ff2010-09-01 23:43:53 +000015637Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015638PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015639{
Victor Stinner577db2c2011-10-11 22:12:48 +020015640 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015641 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015643 if (!PyUnicode_Check(unicode)) {
15644 PyErr_BadArgument();
15645 return NULL;
15646 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015647 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015648 if (u == NULL)
15649 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015650 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015651 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015652 PyErr_NoMemory();
15653 return NULL;
15654 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015655 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015656 size *= sizeof(Py_UNICODE);
15657 copy = PyMem_Malloc(size);
15658 if (copy == NULL) {
15659 PyErr_NoMemory();
15660 return NULL;
15661 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015662 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015663 return copy;
15664}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015665
Georg Brandl66c221e2010-10-14 07:04:07 +000015666/* A _string module, to export formatter_parser and formatter_field_name_split
15667 to the string.Formatter class implemented in Python. */
15668
15669static PyMethodDef _string_methods[] = {
15670 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15671 METH_O, PyDoc_STR("split the argument as a field name")},
15672 {"formatter_parser", (PyCFunction) formatter_parser,
15673 METH_O, PyDoc_STR("parse the argument as a format string")},
15674 {NULL, NULL}
15675};
15676
15677static struct PyModuleDef _string_module = {
15678 PyModuleDef_HEAD_INIT,
15679 "_string",
15680 PyDoc_STR("string helper module"),
15681 0,
15682 _string_methods,
15683 NULL,
15684 NULL,
15685 NULL,
15686 NULL
15687};
15688
15689PyMODINIT_FUNC
15690PyInit__string(void)
15691{
15692 return PyModule_Create(&_string_module);
15693}
15694
15695
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015696#ifdef __cplusplus
15697}
15698#endif